1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
13 #include <sys/types.h>
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
21 #include <security/pam_appl.h>
25 #include <selinux/selinux.h>
29 #include <sys/apparmor.h>
32 #include "sd-messages.h"
35 #include "alloc-util.h"
37 #include "apparmor-util.h"
39 #include "argv-util.h"
43 #include "btrfs-util.h"
45 #include "capability-util.h"
46 #include "chattr-util.h"
47 #include "cgroup-setup.h"
49 #include "chown-recursive.h"
50 #include "constants.h"
51 #include "cpu-set-util.h"
52 #include "credential.h"
53 #include "data-fd-util.h"
56 #include "errno-list.h"
59 #include "exit-status.h"
61 #include "format-util.h"
62 #include "glob-util.h"
63 #include "hexdecoct.h"
65 #include "ioprio-util.h"
66 #include "lock-util.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "missing_prctl.h"
75 #include "mkdir-label.h"
76 #include "namespace.h"
77 #include "parse-util.h"
78 #include "path-util.h"
79 #include "proc-cmdline.h"
80 #include "process-util.h"
82 #include "rlimit-util.h"
84 #include "seccomp-util.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "sort-util.h"
92 #include "stat-util.h"
93 #include "string-table.h"
94 #include "string-util.h"
96 #include "syslog-util.h"
97 #include "terminal-util.h"
98 #include "tmpfile-util.h"
99 #include "umask-util.h"
100 #include "unit-serialize.h"
101 #include "user-util.h"
102 #include "utmp-wtmp.h"
/* Two idle timeouts, 5 and 1 times USEC_PER_SEC respectively. */
104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
/* 8 MiB; handed to fd_inc_sndbuf() for the journal stream socket in connect_logger_as(). */
107 #define SNDBUF_SIZE (8*1024*1024)
/* Renumbers the passed fds so that fds[i] ends up as fd number i+3. Each entry is duplicated with
 * fcntl(F_DUPFD, i+3); if the returned number is not i+3 because that slot is still occupied, the first
 * such index is remembered in restart_from and the pass is started again from there. */
109 static int shift_fds(int fds
[], size_t n_fds
) {
113 /* Modifies the fds array! (sorts it) */
117 for (int start
= 0;;) {
118 int restart_from
= -1;
120 for (int i
= start
; i
< (int) n_fds
; i
++) {
123 /* Already at right index? */
127 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
134 /* Hmm, the fd we wanted isn't free? Then
135 * let's remember that and try again from here */
136 if (nfd
!= i
+3 && restart_from
< 0)
140 if (restart_from
< 0)
143 start
= restart_from
;
/* Adjusts the file flags of the fds that are about to be passed on. For the first n_socket_fds entries
 * fd_nonblock() is called with the 'nonblock' setting; for every entry fd_cloexec(..., false) clears
 * FD_CLOEXEC. */
149 static int flags_fds(
162 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
163 * O_NONBLOCK only applies to socket activation though. */
165 for (size_t i
= 0; i
< n_fds
; i
++) {
167 if (i
< n_socket_fds
) {
168 r
= fd_nonblock(fds
[i
], nonblock
);
173 /* We unconditionally drop FD_CLOEXEC from the fds,
174 * since after all we want to pass these fds to our
177 r
= fd_cloexec(fds
[i
], false);
/* Picks the TTY device path for this context: context->tty_path if one is configured, "/dev/console"
 * otherwise. The stdio_as_fds flag is tested before either of the two. */
185 static const char *exec_context_tty_path(const ExecContext
*context
) {
188 if (context
->stdio_as_fds
)
191 if (context
->tty_path
)
192 return context
->tty_path
;
194 return "/dev/console";
/* Determines the terminal dimensions to use for the context. Starts out with context->tty_rows and
 * context->tty_cols; for whichever of the two is still UINT_MAX, proc_cmdline_tty_size() is asked for a
 * value for the context's TTY path (its result is ignored on failure). The dimensions are presumably
 * handed back through ret_rows/ret_cols. */
197 static int exec_context_tty_size(const ExecContext
*context
, unsigned *ret_rows
, unsigned *ret_cols
) {
205 rows
= context
->tty_rows
;
206 cols
= context
->tty_cols
;
208 tty
= exec_context_tty_path(context
);
210 (void) proc_cmdline_tty_size(tty
, rows
== UINT_MAX
? &rows
: NULL
, cols
== UINT_MAX
? &cols
: NULL
);
/* Best-effort reset of the TTY that belongs to the context. An exclusive BSD lock is taken first: on
 * p->stdin_fd via xopenat_lock() if such an fd was passed, otherwise on a terminal freshly opened with
 * open_terminal() and locked with lock_generic(). Then, depending on tty_vhangup, tty_reset and
 * tty_vt_disallocate, the terminal is hung up, reset and its VT disallocated; if p->stdin_fd is set the
 * terminal size is applied to it as well. Results of the individual steps are ignored. */
218 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
219 _cleanup_close_
int fd
= -EBADF
;
220 const char *path
= exec_context_tty_path(ASSERT_PTR(context
));
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
227 if (p
&& p
->stdin_fd
>= 0) {
228 fd
= xopenat_lock(p
->stdin_fd
, NULL
,
229 O_RDONLY
|O_CLOEXEC
|O_NONBLOCK
|O_NOCTTY
, 0, 0, LOCK_BSD
, LOCK_EX
);
233 fd
= open_terminal(path
, O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NONBLOCK
);
237 if (lock_generic(fd
, LOCK_BSD
, LOCK_EX
) < 0)
240 return; /* nothing to do */
242 if (context
->tty_vhangup
)
243 (void) terminal_vhangup_fd(fd
);
245 if (context
->tty_reset
)
246 (void) reset_terminal_fd(fd
, true);
248 if (p
&& p
->stdin_fd
>= 0) {
249 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
251 (void) exec_context_tty_size(context
, &rows
, &cols
);
252 (void) terminal_set_size_fd(p
->stdin_fd
, path
, rows
, cols
);
255 if (context
->tty_vt_disallocate
&& path
)
256 (void) vt_disallocate(path
);
/* Tells whether input mode i is one of the TTY input modes; the set tested includes
 * EXEC_INPUT_TTY_FORCE and EXEC_INPUT_TTY_FAIL. */
259 static bool is_terminal_input(ExecInput i
) {
262 EXEC_INPUT_TTY_FORCE
,
263 EXEC_INPUT_TTY_FAIL
);
/* Tells whether output mode o counts as terminal output; the set tested includes
 * EXEC_OUTPUT_KMSG_AND_CONSOLE and EXEC_OUTPUT_JOURNAL_AND_CONSOLE. */
266 static bool is_terminal_output(ExecOutput o
) {
269 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
/* Tells whether output mode o counts as kmsg output; the set tested includes
 * EXEC_OUTPUT_KMSG_AND_CONSOLE. */
273 static bool is_kmsg_output(ExecOutput o
) {
276 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
/* Checks, in this order, whether std_input, std_output or std_error is configured with a terminal mode;
 * if none is, the answer is whether a tty_path is set at all. */
279 static bool exec_context_needs_term(const ExecContext
*c
) {
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
284 if (is_terminal_input(c
->std_input
))
287 if (is_terminal_output(c
->std_output
))
290 if (is_terminal_output(c
->std_error
))
293 return !!c
->tty_path
;
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting fd to fd number
 * nfd via move_fd(). */
296 static int open_null_as(int flags
, int nfd
) {
301 fd
= open("/dev/null", flags
|O_NOCTTY
);
305 return move_fd(fd
, nfd
, false);
/* Connects fd to the journal's stdout stream socket: "/run/systemd/journal.<log_namespace>/stdout" for
 * a namespaced journal, "/run/systemd/journal/stdout" otherwise. If gid and/or uid are valid, the
 * effective GID/UID are switched to them with setegid()/seteuid() before connect_unix_path() is called
 * and switched back to the old values afterwards. */
308 static int connect_journal_socket(
310 const char *log_namespace
,
314 uid_t olduid
= UID_INVALID
;
315 gid_t oldgid
= GID_INVALID
;
320 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
321 "/run/systemd/journal/stdout";
323 if (gid_is_valid(gid
)) {
326 if (setegid(gid
) < 0)
330 if (uid_is_valid(uid
)) {
333 if (seteuid(uid
) < 0) {
339 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
344 if (uid_is_valid(uid
))
345 (void) seteuid(olduid
);
348 if (gid_is_valid(gid
))
349 (void) setegid(oldgid
);
/* Sets up fd number nfd as a stream connection to the journal. An AF_UNIX stream socket is created and
 * connected through connect_journal_socket() (using the context's log namespace and the given uid/gid),
 * its read side is shut down, and its send buffer is enlarged to SNDBUF_SIZE on a best-effort basis.
 * The stream parameters are then passed along: the syslog identifier (falling back to 'ident'), the
 * unit id (only with EXEC_PASS_LOG_UNIT, else an empty string), the syslog priority, the level-prefix
 * flag, and whether the output mode is a kmsg and/or terminal one. Finally the socket is moved to nfd. */
354 static int connect_logger_as(
356 const ExecContext
*context
,
357 const ExecParameters
*params
,
364 _cleanup_close_
int fd
= -EBADF
;
369 assert(output
< _EXEC_OUTPUT_MAX
);
373 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
377 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
381 if (shutdown(fd
, SHUT_RD
) < 0)
384 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
394 context
->syslog_identifier
?: ident
,
395 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
396 context
->syslog_priority
,
397 !!context
->syslog_level_prefix
,
399 is_kmsg_output(output
),
400 is_terminal_output(output
)) < 0)
403 return move_fd(TAKE_FD(fd
), nfd
, false);
/* Opens the terminal at path via open_terminal() with the given flags (plus O_NOCTTY) and moves the
 * resulting fd to fd number nfd via move_fd(). */
406 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
412 fd
= open_terminal(path
, flags
| O_NOCTTY
);
416 return move_fd(fd
, nfd
, false);
/* Obtains an fd for path. First a regular open() with the given flags (plus O_NOCTTY) and mode is
 * tried. If that fails with ENXIO the path may be an AF_UNIX socket, so a stream socket is created and
 * connected to it with connect_unix_path(); ENOTSOCK/EINVAL from that attempt indicate that it was not
 * a socket after all. On a connected socket the unused direction is shut down according to the access
 * mode in flags: SHUT_WR for O_RDONLY, SHUT_RD for O_WRONLY. */
419 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
420 _cleanup_close_
int fd
= -EBADF
;
425 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
428 fd
= open(path
, flags
|O_NOCTTY
, mode
);
432 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
437 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
441 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
442 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
449 if ((flags
& O_ACCMODE
) == O_RDONLY
)
450 r
= shutdown(fd
, SHUT_WR
);
451 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
452 r
= shutdown(fd
, SHUT_RD
);
/* Maps the configured context->std_input to the input mode that can actually be used. The result is
 * downgraded to EXEC_INPUT_NULL if a TTY input is configured but apply_tty_stdin is false, if socket
 * input is configured but socket_fd is not valid, or if data input is configured with
 * stdin_data_size == 0. */
461 static int fixup_input(
462 const ExecContext
*context
,
464 bool apply_tty_stdin
) {
470 std_input
= context
->std_input
;
472 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
473 return EXEC_INPUT_NULL
;
475 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
476 return EXEC_INPUT_NULL
;
478 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
479 return EXEC_INPUT_NULL
;
/* Maps the configured output mode to the one that can actually be used: EXEC_OUTPUT_SOCKET without a
 * valid socket_fd is downgraded to EXEC_OUTPUT_INHERIT. */
484 static int fixup_output(ExecOutput output
, int socket_fd
) {
486 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
487 return EXEC_OUTPUT_INHERIT
;
/* Sets up STDIN_FILENO for the process that is about to be executed.
 *
 * If params->stdin_fd is set it is dup2()ed onto stdin; if that turns out to be a TTY it is made the
 * controlling terminal with TIOCSCTTY (the ioctl argument is nonzero for EXEC_INPUT_TTY_FORCE), reset
 * and resized, all best-effort.
 *
 * Otherwise the mode computed by fixup_input() selects the source: /dev/null, a terminal acquired with
 * acquire_terminal(), the socket fd, the named fd in named_iofds[STDIN_FILENO] (switched to blocking
 * mode first), a data fd built from context->stdin_data, or a file opened with acquire_path(). The file
 * is opened O_RDWR rather than O_RDONLY when stdout or stderr are configured as EXEC_OUTPUT_FILE on the
 * very same path. */
492 static int setup_input(
493 const ExecContext
*context
,
494 const ExecParameters
*params
,
496 const int named_iofds
[static 3]) {
505 if (params
->stdin_fd
>= 0) {
506 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
510 if (isatty(STDIN_FILENO
)) {
511 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
513 (void) exec_context_tty_size(context
, &rows
, &cols
);
514 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
515 (void) reset_terminal_fd(STDIN_FILENO
, true);
516 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, rows
, cols
);
522 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
526 case EXEC_INPUT_NULL
:
527 return open_null_as(O_RDONLY
, STDIN_FILENO
);
530 case EXEC_INPUT_TTY_FORCE
:
531 case EXEC_INPUT_TTY_FAIL
: {
535 fd
= acquire_terminal(exec_context_tty_path(context
),
536 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
537 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
538 ACQUIRE_TERMINAL_WAIT
,
543 r
= exec_context_tty_size(context
, &rows
, &cols
);
547 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), rows
, cols
);
551 return move_fd(fd
, STDIN_FILENO
, false);
554 case EXEC_INPUT_SOCKET
:
555 assert(socket_fd
>= 0);
557 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
559 case EXEC_INPUT_NAMED_FD
:
560 assert(named_iofds
[STDIN_FILENO
] >= 0);
562 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
563 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
565 case EXEC_INPUT_DATA
: {
568 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
572 return move_fd(fd
, STDIN_FILENO
, false);
575 case EXEC_INPUT_FILE
: {
579 assert(context
->stdio_file
[STDIN_FILENO
]);
581 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
582 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
584 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
588 return move_fd(fd
, STDIN_FILENO
, false);
592 assert_not_reached();
/* Decides whether stderr can simply become a duplicate of the stdout fd. Among the cases handled: for
 * EXEC_OUTPUT_NAMED_FD the configured stdout and stderr fd names have to match, and for the three file
 * output modes the configured stdout and stderr paths have to match. */
596 static bool can_inherit_stderr_from_stdout(
597 const ExecContext
*context
,
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
606 if (e
== EXEC_OUTPUT_INHERIT
)
611 if (e
== EXEC_OUTPUT_NAMED_FD
)
612 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
614 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
615 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
/* Sets up stdout or stderr (selected by 'fileno') for the process that is about to be executed.
 *
 * An fd passed in params->stdout_fd / params->stderr_fd takes precedence and is dup2()ed into place.
 * Otherwise the effective input and output modes are computed with fixup_input()/fixup_output(). For
 * stderr, the stdout fd is duplicated when can_inherit_stderr_from_stdout() allows it. For an inherited
 * output, the terminal is reopened if a TTY input got downgraded, stdin is duplicated if it is connected
 * to something other than /dev/null or a data fd, and /dev/null is opened write-only otherwise.
 *
 * The remaining modes map to /dev/null, the TTY, a journal stream (connect_logger_as(), falling back to
 * /dev/null with a warning), the socket fd, a named fd, or a file opened through acquire_path().
 *
 * When a journal stream was connected, its device/inode numbers are stored in *journal_stream_dev and
 * *journal_stream_ino, with stderr taking precedence over stdout. */
620 static int setup_output(
622 const ExecContext
*context
,
623 const ExecParameters
*params
,
626 const int named_iofds
[static 3],
630 dev_t
*journal_stream_dev
,
631 ino_t
*journal_stream_ino
) {
641 assert(journal_stream_dev
);
642 assert(journal_stream_ino
);
644 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
646 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
649 return STDOUT_FILENO
;
652 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
653 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
656 return STDERR_FILENO
;
659 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
660 o
= fixup_output(context
->std_output
, socket_fd
);
662 if (fileno
== STDERR_FILENO
) {
664 e
= fixup_output(context
->std_error
, socket_fd
);
666 /* This expects the input and output are already set up */
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e
== EXEC_OUTPUT_INHERIT
&&
671 o
== EXEC_OUTPUT_INHERIT
&&
672 i
== EXEC_INPUT_NULL
&&
673 !is_terminal_input(context
->std_input
) &&
677 /* Duplicate from stdout if possible */
678 if (can_inherit_stderr_from_stdout(context
, o
, e
))
679 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
683 } else if (o
== EXEC_OUTPUT_INHERIT
) {
684 /* If input got downgraded, inherit the original value */
685 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
686 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
690 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY
, fileno
);
702 case EXEC_OUTPUT_NULL
:
703 return open_null_as(O_WRONLY
, fileno
);
705 case EXEC_OUTPUT_TTY
:
706 if (is_terminal_input(i
))
707 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
709 /* We don't reset the terminal if this is just about output */
710 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
712 case EXEC_OUTPUT_KMSG
:
713 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
714 case EXEC_OUTPUT_JOURNAL
:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
716 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
718 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
720 r
= open_null_as(O_WRONLY
, fileno
);
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
726 * services to detect whether they are connected to the journal or not.
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
731 if (fstat(fileno
, &st
) >= 0 &&
732 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
733 *journal_stream_dev
= st
.st_dev
;
734 *journal_stream_ino
= st
.st_ino
;
739 case EXEC_OUTPUT_SOCKET
:
740 assert(socket_fd
>= 0);
742 return RET_NERRNO(dup2(socket_fd
, fileno
));
744 case EXEC_OUTPUT_NAMED_FD
:
745 assert(named_iofds
[fileno
] >= 0);
747 (void) fd_nonblock(named_iofds
[fileno
], false);
748 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
750 case EXEC_OUTPUT_FILE
:
751 case EXEC_OUTPUT_FILE_APPEND
:
752 case EXEC_OUTPUT_FILE_TRUNCATE
: {
756 assert(context
->stdio_file
[fileno
]);
758 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
759 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
762 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
765 if (o
== EXEC_OUTPUT_FILE_APPEND
)
767 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
770 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
774 return move_fd(fd
, fileno
, 0);
778 assert_not_reached();
/* Hands the terminal referenced by fd over to uid: applies TTY_MODE and the new owner through
 * fchmod_and_chown(), leaving the group untouched (GID_INVALID). If isatty() reports EINVAL or ENOTTY
 * the fd is not a terminal and 0 is returned without changing anything. */
782 static int chown_terminal(int fd
, uid_t uid
) {
787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
788 if (isatty(fd
) < 1) {
789 if (IN_SET(errno
, EINVAL
, ENOTTY
))
790 return 0; /* not a tty */
795 /* This might fail. What matters are the results. */
796 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
/* Temporarily redirects stdin and stdout to the console 'vc' so that a confirmation question can be
 * asked there. The current stdin and stdout are first duplicated to fds >= 3. The terminal is then
 * acquired (ACQUIRE_TERMINAL_WAIT, with DEFAULT_CONFIRM_USEC passed along), chowned to the calling uid,
 * reset, resized, and installed as stdin and stdout with rearrange_stdio(); STDERR_FILENO is passed
 * through as the stderr argument. At the end the saved fds are handed to the caller through
 * ret_saved_stdin/ret_saved_stdout; until then they are owned by _cleanup_close_ variables. */
803 static int setup_confirm_stdio(
804 const ExecContext
*context
,
806 int *ret_saved_stdin
,
807 int *ret_saved_stdout
) {
809 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
813 assert(ret_saved_stdin
);
814 assert(ret_saved_stdout
);
816 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
820 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
821 if (saved_stdout
< 0)
824 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
828 r
= chown_terminal(fd
, getuid());
832 r
= reset_terminal_fd(fd
, true);
836 r
= exec_context_tty_size(context
, &rows
, &cols
);
840 r
= terminal_set_size_fd(fd
, vc
, rows
, cols
);
844 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
849 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
850 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
/* Writes a notice to fd explaining why the confirmation question for unit u could not be answered: a
 * dedicated message for -ETIMEDOUT, a generic one (using %m) otherwise. Both state that a positive
 * response is assumed. */
854 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
857 if (err
== -ETIMEDOUT
)
858 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
861 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
/* Like write_confirm_error_fd(), but opens the console 'vc' itself (write-only, without making it the
 * controlling terminal); the fd is closed again automatically on return. */
865 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
866 _cleanup_close_
int fd
= -EBADF
;
870 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
874 write_confirm_error_fd(err
, fd
, u
);
/* Counterpart of setup_confirm_stdio(): dup2()s the saved fds (where valid) back onto STDIN_FILENO and
 * STDOUT_FILENO, then closes the saved copies and stores the result of safe_close() back into
 * *saved_stdin / *saved_stdout. */
877 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
881 assert(saved_stdout
);
885 if (*saved_stdin
>= 0)
886 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
889 if (*saved_stdout
>= 0)
890 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
893 *saved_stdin
= safe_close(*saved_stdin
);
894 *saved_stdout
= safe_close(*saved_stdout
);
/* Results of ask_for_confirmation() for the "fail" and "skip" answers: do not execute the command and
 * pretend it failed, or pretend it succeeded. */
900 CONFIRM_PRETEND_FAILURE
= -1,
901 CONFIRM_PRETEND_SUCCESS
= 0,
/* Asks on the console 'vc' whether the command line 'cmdline' of unit u shall be executed. stdio is
 * redirected with setup_confirm_stdio() first; if that fails the error is reported on the console and
 * CONFIRM_EXECUTE is returned. The (ellipsized) command line is then shown and one of the keys in
 * "yfshiDjcn" is read with ask_char(). The informational answers (help, info, dump, jobs, and the
 * retired 'n') print their output and ask again; "fail" and "skip" yield CONFIRM_PRETEND_FAILURE and
 * CONFIRM_PRETEND_SUCCESS. stdin/stdout are put back with restore_confirm_stdio() at the end. */
905 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
906 int saved_stdout
= -1, saved_stdin
= -1, r
;
907 _cleanup_free_
char *e
= NULL
;
910 /* For any internal errors, assume a positive response. */
911 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
913 write_confirm_error(r
, vc
, u
);
914 return CONFIRM_EXECUTE
;
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
923 e
= ellipsize(cmdline
, 60, 100);
931 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
933 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
945 unit_dump(u
, stdout
, " ");
946 continue; /* ask again */
948 printf("Failing execution.\n");
949 r
= CONFIRM_PRETEND_FAILURE
;
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
954 " f - fail, don't execute the command and pretend it failed\n"
956 " i - info, show a short summary of the unit\n"
957 " j - jobs, show jobs that are in progress\n"
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
960 continue; /* ask again */
962 printf(" Description: %s\n"
965 u
->id
, u
->description
, cmdline
);
966 continue; /* ask again */
968 manager_dump_jobs(u
->manager
, stdout
, /* patterns= */ NULL
, " ");
969 continue; /* ask again */
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
975 printf("Skipping execution.\n");
976 r
= CONFIRM_PRETEND_SUCCESS
;
982 assert_not_reached();
988 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
/* Resolves the user for context c via get_user_creds() with USER_CREDS_CLEAN, filling in *uid, *gid,
 * *home and *shell. The resolved name is presumably returned in *user. */
992 static int get_fixed_user(const ExecContext
*c
, const char **user
,
993 uid_t
*uid
, gid_t
*gid
,
994 const char **home
, const char **shell
) {
1003 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1004 * (i.e. are "/" or "/bin/nologin"). */
1007 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
/* Resolves the group for context c via get_group_creds(), filling in *gid. The resolved name is
 * presumably returned in *group. */
1015 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
1025 r
= get_group_creds(&name
, gid
, 0);
/* Computes the list of supplementary GIDs for the process. If a user is given and gid is valid and not
 * 0, initgroups() is called first. When c->supplementary_groups is non-empty, a buffer of
 * sysconf(_SC_NGROUPS_MAX) entries is allocated, optionally pre-filled via getgrouplist(), and each
 * configured group is resolved with get_group_creds() into the next slot (bounded by ngroups_max). A
 * copy of the first k entries is returned in *supplementary_gids; the count presumably goes to *ngids. */
1033 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
1034 const char *group
, gid_t gid
,
1035 gid_t
**supplementary_gids
, int *ngids
) {
1038 bool keep_groups
= false;
1039 gid_t
*groups
= NULL
;
1040 _cleanup_free_ gid_t
*l_gids
= NULL
;
1045 * If user is given, then lookup GID and supplementary groups list.
1046 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1047 * here and as early as possible so we keep the list of supplementary
1048 * groups of the caller.
1050 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1051 /* First step, initialize groups from /etc/group */
1052 if (initgroups(user
, gid
) < 0)
1058 if (strv_isempty(c
->supplementary_groups
))
1062 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1063 * be positive, otherwise fail.
1066 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1067 if (ngroups_max
<= 0)
1068 return errno_or_else(EOPNOTSUPP
);
1070 l_gids
= new(gid_t
, ngroups_max
);
1076 * Lookup the list of groups that the user belongs to, we
1077 * avoid NSS lookups here too for gid=0.
1080 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1085 STRV_FOREACH(i
, c
->supplementary_groups
) {
1088 if (k
>= ngroups_max
)
1092 r
= get_group_creds(&g
, l_gids
+k
, 0);
1100 * Sets ngids to zero to drop all supplementary groups, happens
1101 * when we are under root and SupplementaryGroups= is empty.
1108 /* Otherwise get the final list of supplementary groups */
1109 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1113 *supplementary_gids
= groups
;
/* Applies the group credentials: installs the supplementary group list through maybe_setgroups() and,
 * if gid is valid, sets the real, effective and saved GID to it with setresgid(). */
1121 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1124 /* Handle SupplementaryGroups= if it is not empty */
1126 r
= maybe_setgroups(ngids
, supplementary_gids
);
1131 if (gid_is_valid(gid
)) {
1132 /* Then set our gids */
1133 if (setresgid(gid
, gid
, gid
) < 0)
/* Updates the process' securebits: reads the current value with PR_GET_SECUREBITS, clears the bits in
 * 'mask', ORs in 'bits', and writes the result back with PR_SET_SECUREBITS. The comparison against the
 * current value allows the write to be skipped when nothing would change. */
1140 static int set_securebits(unsigned bits
, unsigned mask
) {
1144 current
= prctl(PR_GET_SECUREBITS
);
1148 /* Clear all securebits defined in mask and set bits */
1149 applied
= ((unsigned) current
& ~mask
) | bits
;
1150 if ((unsigned) current
== applied
)
1153 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
/* Switches the process to 'uid' with setresuid(). If ambient capabilities or secure bits are configured
 * and the target UID is not 0, SECURE_KEEP_CAPS is added to the securebits beforehand so that the
 * capabilities survive the UID change. */
1159 static int enforce_user(
1160 const ExecContext
*context
,
1162 uint64_t capability_ambient_set
) {
1166 if (!uid_is_valid(uid
))
1169 /* Sets (but doesn't look up) the UID and makes sure we keep the capabilities while doing so. For
1170 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1173 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1175 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1176 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1177 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1182 /* Second step: actually set the uids */
1183 if (setresuid(uid
, uid
, uid
) < 0)
1186 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1187 * the caps might have gotten corrupted due to the setresuid() so we need to clean them up later. This is done
1188 * outside of this call. */
/* PAM conversation callback that refuses any conversation by returning PAM_CONV_ERR. */
1194 static int null_conv(
1196 const struct pam_message
**msg
,
1197 struct pam_response
**resp
,
1198 void *appdata_ptr
) {
1200 /* We don't support conversations */
1202 return PAM_CONV_ERR
;
/* Opens a PAM session and forks off an "(sd-pam)" helper process that tears it down again later.
 *
 * In the calling process: pam_start(), PAM_TTY is set (a TTY name read off stdin may be used), the
 * entries of *env are exported with pam_putenv(), then pam_acct_mgmt(), pam_setcred() (failure only
 * logged) and pam_open_session() run, and the PAM environment is fetched with pam_getenvlist().
 *
 * In the helper: the passed fds are closed, privileges are dropped to uid/gid (failures only warned
 * about), PR_SET_PDEATHSIG is armed with SIGTERM, the barrier is placed, and the helper waits for
 * SIGTERM unless the parent is already gone. It then deletes the credentials, closes the session if
 * the parent died, and calls pam_end() with PAM_DATA_SILENT.
 *
 * Back in the caller, SIGTERM is unblocked again, the barrier is synchronized, and *env is replaced by
 * the PAM environment. On the failure path the error is logged, and pam_close_session()/pam_end()
 * clean up. */
1207 static int setup_pam(
1213 char ***env
, /* updated on success */
1214 const int fds
[], size_t n_fds
) {
1218 static const struct pam_conv conv
= {
1223 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1224 _cleanup_strv_free_
char **e
= NULL
;
1225 pam_handle_t
*handle
= NULL
;
1227 int pam_code
= PAM_SUCCESS
, r
;
1228 bool close_session
= false;
1229 pid_t pam_pid
= 0, parent_pid
;
1236 /* We set up PAM in the parent process, then fork. The child
1237 * will then stay around until killed via PR_SET_PDEATHSIG or
1238 * systemd via the cgroup logic. It will then remove the PAM
1239 * session again. The parent process will exec() the actual
1240 * daemon. We do things this way to ensure that the main PID
1241 * of the daemon is the one we initially fork()ed. */
1243 r
= barrier_create(&barrier
);
1247 if (log_get_max_level() < LOG_DEBUG
)
1248 flags
|= PAM_SILENT
;
1250 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1251 if (pam_code
!= PAM_SUCCESS
) {
1257 _cleanup_free_
char *q
= NULL
;
1259 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1260 * out if that's the case, and read the TTY off it. */
1262 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1263 tty
= strjoina("/dev/", q
);
1267 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1268 if (pam_code
!= PAM_SUCCESS
)
1272 STRV_FOREACH(nv
, *env
) {
1273 pam_code
= pam_putenv(handle
, *nv
);
1274 if (pam_code
!= PAM_SUCCESS
)
1278 pam_code
= pam_acct_mgmt(handle
, flags
);
1279 if (pam_code
!= PAM_SUCCESS
)
1282 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1283 if (pam_code
!= PAM_SUCCESS
)
1284 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1286 pam_code
= pam_open_session(handle
, flags
);
1287 if (pam_code
!= PAM_SUCCESS
)
1290 close_session
= true;
1292 e
= pam_getenvlist(handle
);
1294 pam_code
= PAM_BUF_ERR
;
1298 /* Block SIGTERM, so that we know that it won't get lost in the child */
1300 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1302 parent_pid
= getpid_cached();
1304 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1308 int sig
, ret
= EXIT_PAM
;
1310 /* The child's job is to reset the PAM session on termination */
1311 barrier_set_role(&barrier
, BARRIER_CHILD
);
1313 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1314 * those fds are open here that have been opened by PAM. */
1315 (void) close_many(fds
, n_fds
);
1317 /* Drop privileges - we don't need any to pam_close_session and this will make
1318 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1319 * threads to fail to exit normally */
1321 r
= maybe_setgroups(0, NULL
);
1323 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1324 if (setresgid(gid
, gid
, gid
) < 0)
1325 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1326 if (setresuid(uid
, uid
, uid
) < 0)
1327 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1329 (void) ignore_signals(SIGPIPE
);
1331 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1332 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1333 * this way. We rely on the control groups kill logic to do the rest for us. */
1334 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1337 /* Tell the parent that our setup is done. This is especially important regarding dropping
1338 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1340 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1341 (void) barrier_place(&barrier
);
1343 /* Check if our parent process might already have died? */
1344 if (getppid() == parent_pid
) {
1347 assert_se(sigemptyset(&ss
) >= 0);
1348 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1351 if (sigwait(&ss
, &sig
) < 0) {
1358 assert(sig
== SIGTERM
);
1363 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1364 if (pam_code
!= PAM_SUCCESS
)
1367 /* If our parent died we'll end the session */
1368 if (getppid() != parent_pid
) {
1369 pam_code
= pam_close_session(handle
, flags
);
1370 if (pam_code
!= PAM_SUCCESS
)
1377 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1378 * know about this. See pam_end(3) */
1379 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1383 barrier_set_role(&barrier
, BARRIER_PARENT
);
1385 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1389 /* Unblock SIGTERM again in the parent */
1390 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1392 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1393 * this fd around. */
1396 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1397 * recover. However, warn loudly if it happens. */
1398 if (!barrier_place_and_sync(&barrier
))
1399 log_error("PAM initialization failed");
1401 return strv_free_and_replace(*env
, e
);
1404 if (pam_code
!= PAM_SUCCESS
) {
1405 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1406 r
= -EPERM
; /* PAM errors do not map to errno */
1408 log_error_errno(r
, "PAM failed: %m");
1412 pam_code
= pam_close_session(handle
, flags
);
1414 (void) pam_end(handle
, pam_code
| flags
);
/* Renames the current process to "(<name>)", with <name> derived from the file name component of path.
 * If path_extract_filename() fails the process is renamed to "(...)". The name is assembled in an
 * 11 byte buffer: '(', the name bytes, ')', NUL. */
1424 static void rename_process_from_path(const char *path
) {
1425 _cleanup_free_
char *buf
= NULL
;
1430 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1433 if (path_extract_filename(path
, &buf
) < 0) {
1434 rename_process("(...)");
1438 size_t l
= strlen(buf
);
1440 /* The end of the process name is usually more interesting, since the first bit might just be
1447 char process_name
[11];
1448 process_name
[0] = '(';
1449 memcpy(process_name
+1, p
, l
);
1450 process_name
[1+l
] = ')';
1451 process_name
[1+l
+1] = 0;
1453 rename_process(process_name
);
1456 static bool context_has_address_families(const ExecContext
*c
) {
1459 return c
->address_families_allow_list
||
1460 !set_isempty(c
->address_families
);
1463 static bool context_has_syscall_filters(const ExecContext
*c
) {
1466 return c
->syscall_allow_list
||
1467 !hashmap_isempty(c
->syscall_filter
);
1470 static bool context_has_syscall_logs(const ExecContext
*c
) {
1473 return c
->syscall_log_allow_list
||
1474 !hashmap_isempty(c
->syscall_log
);
1477 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1480 if (c
->no_new_privileges
)
1483 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1486 /* We need NNP if we have any form of seccomp and are unprivileged */
1487 return c
->lock_personality
||
1488 c
->memory_deny_write_execute
||
1489 c
->private_devices
||
1491 c
->protect_hostname
||
1492 c
->protect_kernel_tunables
||
1493 c
->protect_kernel_modules
||
1494 c
->protect_kernel_logs
||
1495 context_has_address_families(c
) ||
1496 exec_context_restrict_namespaces_set(c
) ||
1497 c
->restrict_realtime
||
1498 c
->restrict_suid_sgid
||
1499 !set_isempty(c
->syscall_archs
) ||
1500 context_has_syscall_filters(c
) ||
1501 context_has_syscall_logs(c
);
1506 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1508 if (is_seccomp_available())
1511 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1515 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1516 uint32_t negative_action
, default_action
, action
;
1522 if (!context_has_syscall_filters(c
))
1525 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1528 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1530 if (c
->syscall_allow_list
) {
1531 default_action
= negative_action
;
1532 action
= SCMP_ACT_ALLOW
;
1534 default_action
= SCMP_ACT_ALLOW
;
1535 action
= negative_action
;
1538 if (needs_ambient_hack
) {
1539 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1544 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
/* Installs a seccomp filter that logs system calls according to c->syscall_log.
 * With an allow-list only the listed calls are logged (SCMP_ACT_LOG) and the rest is allowed silently;
 * otherwise everything except the listed calls is logged.
 * NOTE(review): the two statements at the end look like the fallback for libseccomp versions without
 * SCMP_ACT_LOG; the preprocessor conditionals that select between both paths, and the early returns,
 * are not visible in this excerpt. */
1547 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1549 uint32_t default_action
, action
;
1555 if (!context_has_syscall_logs(c
))
1559 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1562 if (c
->syscall_log_allow_list
) {
1563 /* Log nothing but the ones listed */
1564 default_action
= SCMP_ACT_ALLOW
;
1565 action
= SCMP_ACT_LOG
;
1567 /* Log everything but the ones listed */
1568 default_action
= SCMP_ACT_LOG
;
1569 action
= SCMP_ACT_ALLOW
;
/* 'action' is used for the calls in c->syscall_log, 'default_action' for all other calls */
1572 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1574 /* old libseccomp */
1575 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1580 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1584 if (set_isempty(c
->syscall_archs
))
1587 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1590 return seccomp_restrict_archs(c
->syscall_archs
);
1593 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1597 if (!context_has_address_families(c
))
1600 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1603 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
/* Enforces c->memory_deny_write_execute. First tries prctl(PR_SET_MDWE); if the kernel rejects that
 * with EINVAL, falls back to the seccomp-based implementation (unless seccomp is unavailable).
 * NOTE(review): the branch that returns after a successful prctl() is not visible in this excerpt. */
1606 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1612 if (!c
->memory_deny_write_execute
)
1615 /* use prctl() if kernel supports it (6.3) */
1616 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1618 log_unit_debug(u
, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
/* EINVAL is taken to mean "this kernel does not know PR_SET_MDWE"; any other errno is a real failure */
1621 if (r
< 0 && errno
!= EINVAL
)
1622 return log_unit_debug_errno(u
, errno
, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1623 /* else use seccomp */
1624 log_unit_debug(u
, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1626 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1629 return seccomp_memory_deny_write_execute();
1632 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1636 if (!c
->restrict_realtime
)
1639 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1642 return seccomp_restrict_realtime();
1645 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1649 if (!c
->restrict_suid_sgid
)
1652 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1655 return seccomp_restrict_suid_sgid();
1658 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1662 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1663 * let's protect even those systems where this is left on in the kernel. */
1665 if (!c
->protect_kernel_tunables
)
1668 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1671 return seccomp_protect_sysctl();
1674 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1678 /* Turn off module syscalls on ProtectKernelModules=yes */
1680 if (!c
->protect_kernel_modules
)
1683 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1686 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1689 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1693 if (!c
->protect_kernel_logs
)
1696 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1699 return seccomp_protect_syslog();
1702 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1706 if (!c
->protect_clock
)
1709 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1712 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
1715 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1719 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1721 if (!c
->private_devices
)
1724 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1727 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1730 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1734 if (!exec_context_restrict_namespaces_set(c
))
1737 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1740 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
/* Locks the process personality through seccomp when c->lock_personality is set. The personality to
 * lock is c->personality, or — if that is PERSONALITY_INVALID — the value chosen by
 * opinionated_personality().
 * NOTE(review): the early returns and the handling of a failing opinionated_personality() are not
 * visible in this excerpt. */
1743 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1744 unsigned long personality
;
1750 if (!c
->lock_personality
)
1753 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1756 personality
= c
->personality
;
1758 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1759 if (personality
== PERSONALITY_INVALID
) {
1761 r
= opinionated_personality(&personality
);
1766 return seccomp_lock_personality(personality
);
1772 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1776 if (!exec_context_restrict_filesystems_set(c
))
1779 if (!u
->manager
->restrict_fs
) {
1780 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1781 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1785 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
/* Implements c->protect_hostname in two steps: unshare a UTS namespace (if the kernel supports that
 * namespace type), then install the seccomp hostname filter (unless seccomp is unavailable).
 * On a fatal error *ret_exit_status is set (EXIT_NAMESPACE or EXIT_SECCOMP) and the logged error is
 * returned.
 * NOTE(review): the early returns and the else-branches that pair the two warnings with their
 * conditions are not visible in this excerpt. */
1789 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1793 if (!c
->protect_hostname
)
1796 if (ns_type_supported(NAMESPACE_UTS
)) {
1797 if (unshare(CLONE_NEWUTS
) < 0) {
/* Only errors other than "not supported" / "no privilege" are fatal; those two merely produce a warning */
1798 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1799 *ret_exit_status
= EXIT_NAMESPACE
;
1800 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1803 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1806 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1811 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1814 r
= seccomp_protect_hostname();
1816 *ret_exit_status
= EXIT_SECCOMP
;
1817 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
/* Runs the idle-pipe handshake on the four descriptors in idle_pipe[].
 * idle_pipe[1] and idle_pipe[2] are closed right away. If idle_pipe[0] is open, the function waits up
 * to IDLE_TIMEOUT_USEC for POLLHUP on it; if that wait times out and idle_pipe[3] is open, one byte is
 * written to idle_pipe[3] and a second wait of up to IDLE_TIMEOUT2_USEC follows. idle_pipe[0] and
 * idle_pipe[3] are closed afterwards; each slot is overwritten with the return value of safe_close().
 * NOTE(review): whether the second wait depends on the result of write() is not visible in this
 * excerpt. */
1824 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1827 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1828 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1830 if (idle_pipe
[0] >= 0) {
1833 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1835 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1838 /* Signal systemd that we are bored and want to continue. */
1839 n
= write(idle_pipe
[3], "x", 1);
1841 /* Wait for systemd to react to the signal above. */
1842 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1845 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1849 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Forward declaration: build_environment() below uses this to obtain the environment variable name
 * for a directory type. */
1852 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
/* Builds the array of environment variables that are computed from the execution context and
 * parameters, and hands it over in *ret.
 *
 * Variables set here, each only under the condition visible at its place: LISTEN_PID, LISTEN_FDS,
 * LISTEN_FDNAMES, WATCHDOG_PID, WATCHDOG_USEC, SYSTEMD_NSS_DYNAMIC_BYPASS, HOME, LOGNAME, USER, SHELL,
 * INVOCATION_ID, TERM, JOURNAL_STREAM, LOG_NAMESPACE, one variable per directory type,
 * CREDENTIALS_DIRECTORY, SYSTEMD_EXEC_PID, MEMORY_PRESSURE_WATCH and MEMORY_PRESSURE_WRITE.
 *
 * NOTE(review): several parameters (the unit, fd count/names, home, shell, ret), most conditions and all
 * allocation-failure returns are not visible in this excerpt. */
1854 static int build_environment(
1856 const ExecContext
*c
,
1857 const ExecParameters
*p
,
1858 const CGroupContext
*cgroup_context
,
1862 const char *username
,
1864 dev_t journal_stream_dev
,
1865 ino_t journal_stream_ino
,
1866 const char *memory_pressure_path
,
1869 _cleanup_strv_free_
char **our_env
= NULL
;
/* 18 variables can be added outside the per-directory-type loop, plus one slot for the terminating
 * NULL. Keep this in sync when adding a variable below. */
1879 #define N_ENV_VARS 19
1880 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1885 _cleanup_free_
char *joined
= NULL
;
1887 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1889 our_env
[n_env
++] = x
;
1891 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1893 our_env
[n_env
++] = x
;
1895 joined
= strv_join(fdnames
, ":");
1899 x
= strjoin("LISTEN_FDNAMES=", joined
);
1902 our_env
[n_env
++] = x
;
1905 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1906 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1908 our_env
[n_env
++] = x
;
1910 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1912 our_env
[n_env
++] = x
;
1915 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1916 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1917 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1918 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1919 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1922 our_env
[n_env
++] = x
;
1926 x
= strjoin("HOME=", home
);
/* 5 == strlen("HOME="): only the path part of the string is simplified */
1930 path_simplify(x
+ 5);
1931 our_env
[n_env
++] = x
;
1935 x
= strjoin("LOGNAME=", username
);
1938 our_env
[n_env
++] = x
;
1940 x
= strjoin("USER=", username
);
1943 our_env
[n_env
++] = x
;
1947 x
= strjoin("SHELL=", shell
);
/* 6 == strlen("SHELL="): only the path part of the string is simplified */
1951 path_simplify(x
+ 6);
1952 our_env
[n_env
++] = x
;
1955 if (!sd_id128_is_null(u
->invocation_id
)) {
1956 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1959 our_env
[n_env
++] = x
;
1962 if (exec_context_needs_term(c
)) {
1963 _cleanup_free_
char *cmdline
= NULL
;
1964 const char *tty_path
, *term
= NULL
;
1966 tty_path
= exec_context_tty_path(c
);
1968 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1969 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1970 * container manager passes to PID 1 ends up all the way in the console login shown. */
1972 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
1973 term
= getenv("TERM");
1974 else if (tty_path
&& in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
1975 _cleanup_free_
char *key
= NULL
;
/* For other TTYs, look for a per-TTY override on the kernel command line: systemd.tty.term.<tty> */
1977 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
1981 r
= proc_cmdline_get_key(key
, 0, &cmdline
);
1983 log_debug_errno(r
, "Failed to read %s from kernel cmdline, ignoring: %m", key
);
1989 term
= default_term_for_tty(tty_path
);
1991 x
= strjoin("TERM=", term
);
1994 our_env
[n_env
++] = x
;
1997 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1998 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
2001 our_env
[n_env
++] = x
;
2004 if (c
->log_namespace
) {
2005 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
2009 our_env
[n_env
++] = x
;
/* One variable per directory type: the configured paths, each prefixed with p->prefix[t] and joined
 * with ':'. Types without entries are tested for below. */
2012 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2013 _cleanup_free_
char *joined
= NULL
;
2019 if (c
->directories
[t
].n_items
== 0)
2022 n
= exec_directory_env_name_to_string(t
);
2026 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
2027 _cleanup_free_
char *prefixed
= NULL
;
2029 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
2033 if (!strextend_with_separator(&joined
, ":", prefixed
))
2037 x
= strjoin(n
, "=", joined
);
2041 our_env
[n_env
++] = x
;
2044 if (exec_context_has_credentials(c
) && p
->prefix
[EXEC_DIRECTORY_RUNTIME
]) {
2045 x
= strjoin("CREDENTIALS_DIRECTORY=", p
->prefix
[EXEC_DIRECTORY_RUNTIME
], "/credentials/", u
->id
);
2049 our_env
[n_env
++] = x
;
2052 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2055 our_env
[n_env
++] = x
;
2057 if (memory_pressure_path
) {
2058 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2062 our_env
[n_env
++] = x
;
2064 if (cgroup_context
&& !path_equal(memory_pressure_path
, "/dev/null")) {
2065 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
/* Format is "<type> <threshold> <window>"; a configured threshold is clamped to [1, window] */
2067 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2068 MEMORY_PRESSURE_DEFAULT_TYPE
,
2069 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2070 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2071 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
/* strlen(b) + 1: the terminating NUL byte is part of the base64-encoded data */
2074 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2077 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2081 our_env
[n_env
++] = x
;
/* Strictly smaller: one slot of the array has to remain for the terminating NULL */
2085 assert(n_env
< N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2088 *ret
= TAKE_PTR(our_env
);
/* Builds a NULL-terminated array of "NAME=value" strings, one for each name in c->pass_environment,
 * and hands it over in *ret.
 * NOTE(review): where 'v' comes from, and what happens to names without a value, is not visible in
 * this excerpt. */
2093 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2094 _cleanup_strv_free_
char **pass_env
= NULL
;
2097 STRV_FOREACH(i
, c
->pass_environment
) {
2098 _cleanup_free_
char *x
= NULL
;
2104 x
= strjoin(*i
, "=", v
);
/* + 2: room for the new entry and for the NULL terminator that is written right after it */
2108 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2111 pass_env
[n_env
++] = TAKE_PTR(x
);
2112 pass_env
[n_env
] = NULL
;
2115 *ret
= TAKE_PTR(pass_env
);
2120 bool exec_needs_network_namespace(const ExecContext
*context
) {
2123 return context
->private_network
|| context
->network_namespace_path
;
2126 static bool exec_needs_ephemeral(const ExecContext
*context
) {
2127 return (context
->root_image
|| context
->root_directory
) && context
->root_ephemeral
;
2130 static bool exec_needs_ipc_namespace(const ExecContext
*context
) {
2133 return context
->private_ipc
|| context
->ipc_namespace_path
;
/* Checks the settings of 'context' (plus, optionally, 'params' and 'runtime') that — judging by the
 * function name — make a private mount namespace necessary: root image, path access lists, bind
 * mounts, temporary file systems, mount/extension images, mount propagation, private /tmp, the
 * Protect*= / Private*= family, root directory with API VFS or configured directories, dynamic users
 * with state/cache/logs directories, and a log namespace.
 * 'params' and 'runtime' are tested for NULL before they are dereferenced.
 * NOTE(review): the return statements and the nesting below the root_directory test are not visible
 * in this excerpt. */
2136 bool exec_needs_mount_namespace(
2137 const ExecContext
*context
,
2138 const ExecParameters
*params
,
2139 const ExecRuntime
*runtime
) {
2143 if (context
->root_image
)
2146 if (!strv_isempty(context
->read_write_paths
) ||
2147 !strv_isempty(context
->read_only_paths
) ||
2148 !strv_isempty(context
->inaccessible_paths
) ||
2149 !strv_isempty(context
->exec_paths
) ||
2150 !strv_isempty(context
->no_exec_paths
))
2153 if (context
->n_bind_mounts
> 0)
2156 if (context
->n_temporary_filesystems
> 0)
2159 if (context
->n_mount_images
> 0)
2162 if (context
->n_extension_images
> 0)
2165 if (!strv_isempty(context
->extension_directories
))
/* Any propagation flag other than 0 or MS_SHARED counts */
2168 if (!IN_SET(context
->mount_propagation_flag
, 0, MS_SHARED
))
/* private_tmp only counts if the shared runtime actually carries a tmp or var-tmp directory */
2171 if (context
->private_tmp
&& runtime
&& runtime
->shared
&& (runtime
->shared
->tmp_dir
|| runtime
->shared
->var_tmp_dir
))
2174 if (context
->private_devices
||
2175 context
->private_mounts
> 0 ||
/* NOTE(review): a negative private_mounts presumably means "not configured"; a network namespace
 * then counts as a reason on its own — confirm */
2176 (context
->private_mounts
< 0 && exec_needs_network_namespace(context
)) ||
2177 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2178 context
->protect_home
!= PROTECT_HOME_NO
||
2179 context
->protect_kernel_tunables
||
2180 context
->protect_kernel_modules
||
2181 context
->protect_kernel_logs
||
2182 context
->protect_control_groups
||
2183 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2184 context
->proc_subset
!= PROC_SUBSET_ALL
||
2185 exec_needs_ipc_namespace(context
))
2188 if (context
->root_directory
) {
2189 if (exec_context_get_effective_mount_apivfs(context
))
2192 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2193 if (params
&& !params
->prefix
[t
])
2196 if (context
->directories
[t
].n_items
> 0)
2201 if (context
->dynamic_user
&&
2202 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2203 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2204 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2207 if (context
->log_namespace
)
/* Moves the calling process into a new user namespace and installs its UID/GID maps.
 *
 * ouid/ogid are the IDs from before any user or group change, uid/gid the selected IDs. The maps are
 * written by a forked helper ("(sd-userns)") that stays in the original namespace: it blocks on an
 * eventfd until the caller has called unshare(CLONE_NEWUSER), then writes "setgroups", "gid_map" and
 * "uid_map" of the caller below /proc. Errors of the helper travel back through errno_pipe.
 *
 * NOTE(review): the error returns, the declarations of 'c', 'a', 'ppid', 'n' and the label the helper
 * jumps to on failure are not visible in this excerpt. */
2213 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2214 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2215 _cleanup_close_pair_
int errno_pipe
[2] = PIPE_EBADF
;
2216 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2217 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2222 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2223 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2224 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2225 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2226 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2227 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2228 * continues execution normally.
2229 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2230 * does not need CAP_SETUID to write the single line mapping to itself. */
2232 /* Can only set up multiple mappings with CAP_SETUID. */
2233 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2234 r
= asprintf(&uid_map
,
2235 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2236 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2237 ouid
, ouid
, uid
, uid
);
2239 r
= asprintf(&uid_map
,
2240 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2246 /* Can only set up multiple mappings with CAP_SETGID. */
2247 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2248 r
= asprintf(&gid_map
,
2249 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2250 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2251 ogid
, ogid
, gid
, gid
);
2253 r
= asprintf(&gid_map
,
2254 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2260 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2262 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2263 if (unshare_ready_fd
< 0)
2266 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2268 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2271 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2275 _cleanup_close_
int fd
= -EBADF
;
2279 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2280 * here, after the parent opened its own user namespace. */
2283 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2285 /* Wait until the parent unshared the user namespace */
2286 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2291 /* Disable the setgroups() system call in the child user namespace, for good. */
2292 a
= procfs_file_alloca(ppid
, "setgroups");
2293 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2295 if (errno
!= ENOENT
) {
2300 /* If the file is missing the kernel is too old, let's continue anyway. */
2302 if (write(fd
, "deny\n", 5) < 0) {
2307 fd
= safe_close(fd
);
2310 /* First write the GID map */
2311 a
= procfs_file_alloca(ppid
, "gid_map");
2312 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2317 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2321 fd
= safe_close(fd
);
2323 /* Then write the UID map */
2324 a
= procfs_file_alloca(ppid
, "uid_map");
2325 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2330 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2335 _exit(EXIT_SUCCESS
);
/* Failure path of the helper: send the error code to the parent, which reads sizeof(r) bytes from
 * the other end of errno_pipe */
2338 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2339 _exit(EXIT_FAILURE
);
/* From here on: parent. Close the write end, so that the read() below sees EOF once the helper is done */
2342 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2344 if (unshare(CLONE_NEWUSER
) < 0)
2347 /* Let the child know that the namespace is ready now */
2348 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2351 /* Try to read an error code from the child */
2352 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2355 if (n
== sizeof(r
)) { /* an error code was sent to us */
2360 if (n
!= 0) /* on success we should have read 0 bytes */
2363 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2366 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
/* Tells whether directories of the given type get the "private/" treatment that
 * setup_exec_directory() applies to units with dynamic users. Three conditions are tested: dynamic_user
 * being off, the type being EXEC_DIRECTORY_CONFIGURATION, and a runtime directory whose preserve mode
 * is EXEC_PRESERVE_NO.
 * NOTE(review): the return statements are not visible in this excerpt; presumably each of the three
 * tests yields false and everything else yields true — confirm. */
2372 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2375 if (!context
->dynamic_user
)
2378 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2381 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
/* For every entry of 'symlinks', creates a symlink at <root>/<entry> that points to <root>/<source>.
 * Missing parent directories of the link are created with mode 0755 first.
 * NOTE(review): the error handling after each call is not visible in this excerpt. */
2387 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2388 _cleanup_free_
char *src_abs
= NULL
;
2393 src_abs
= path_join(root
, source
);
2397 STRV_FOREACH(dst
, symlinks
) {
2398 _cleanup_free_
char *dst_abs
= NULL
;
2400 dst_abs
= path_join(root
, *dst
);
2404 r
= mkdir_parents_label(dst_abs
, 0755);
/* The last argument is named make_relative in the symlink_idempotent() call of setup_exec_directory() */
2408 r
= symlink_idempotent(src_abs
, dst_abs
, true);
/* Creates and prepares the directories configured for directory type 'type'
 * (context->directories[type].items[]) below params->prefix[type].
 *
 * Per directory, in this order: create the parents; in user scope, possibly add a compatibility symlink
 * for state/logs directories; create the directory itself, either below "private/" plus a symlink (see
 * exec_directory_is_private()) or directly, migrating an existing directory between the two layouts if
 * needed; fix the access mode; and, in system scope, chown the tree to uid/gid. Finally, unless a mount
 * namespace will be used, the additional symlinks configured per directory are created.
 *
 * *exit_status receives the exit status that belongs to 'type' (see exit_status_table[]).
 * NOTE(review): the error handling (including the label under which *exit_status is set), several
 * parameters and most else-branches are not visible in this excerpt. */
2416 static int setup_exec_directory(
2418 const ExecContext
*context
,
2419 const ExecParameters
*params
,
2422 ExecDirectoryType type
,
2423 bool needs_mount_namespace
,
2426 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2427 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2428 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2429 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2430 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2431 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2437 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2438 assert(exit_status
);
2440 if (!params
->prefix
[type
])
2443 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2444 if (!uid_is_valid(uid
))
2446 if (!gid_is_valid(gid
))
2450 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
/* p: the directory at its public location; pp: its location below "private/", only set for
 * private directories */
2451 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2453 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2459 r
= mkdir_parents_label(p
, 0755);
2463 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2465 /* If we are in user mode, and a configuration directory exists but a state directory
2466 * doesn't exist, then we likely are upgrading from an older systemd version that
2467 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2468 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2469 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2470 * separated. If a service has both dirs configured but only the configuration dir
2471 * exists and the state dir does not, we assume we are looking at an update
2472 * situation. Hence, create a compatibility symlink, so that all expectations are
2475 * (We also do something similar with the log directory, which still doesn't exist in
2476 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2478 /* this assumes the state dir is always created before the configuration dir */
2479 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2480 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2482 r
= laccess(p
, F_OK
);
2484 _cleanup_free_
char *q
= NULL
;
2486 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2487 * under the configuration hierarchy. */
2489 if (type
== EXEC_DIRECTORY_STATE
)
2490 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], context
->directories
[type
].items
[i
].path
);
2491 else if (type
== EXEC_DIRECTORY_LOGS
)
2492 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", context
->directories
[type
].items
[i
].path
);
2494 assert_not_reached();
2500 r
= laccess(q
, F_OK
);
2502 /* It does exist! This hence looks like an update. Symlink the
2503 * configuration directory into the state directory. */
2505 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2509 log_unit_notice(u
, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2511 } else if (r
!= -ENOENT
)
2512 log_unit_warning_errno(u
, r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2515 log_unit_warning_errno(u
, r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2518 if (exec_directory_is_private(context
, type
)) {
2519 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2520 * case we want to avoid leaving a directory around fully accessible that is owned by
2521 * a dynamic user whose UID is later on reused. To lock this down we use the same
2522 * trick used by container managers to prohibit host users to get access to files of
2523 * the same UID in containers: we place everything inside a directory that has an
2524 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2525 * for unprivileged host code. We then use fs namespacing to make this directory
2526 * permeable for the service itself.
2528 * Specifically: for a service which wants a special directory "foo/" we first create
2529 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2530 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2531 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2532 * unprivileged host users can't look into it. Inside of the namespace of the unit
2533 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2534 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2535 * for the service and making sure it only gets access to the dirs it needs but no
2536 * others. Tricky? Yes, absolutely, but it works!
2538 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2539 * to be owned by the service itself.
2541 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2542 * for sharing files or sockets with other services. */
2544 pp
= path_join(params
->prefix
[type
], "private");
2550 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2551 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2555 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2560 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2561 r
= mkdir_parents_label(pp
, 0755);
2565 if (is_dir(p
, false) > 0 &&
2566 (laccess(pp
, F_OK
) == -ENOENT
)) {
2568 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2569 * it over. Most likely the service has been upgraded from one that didn't use
2570 * DynamicUser=1, to one that does. */
2572 log_unit_info(u
, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2573 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2574 exec_directory_type_to_string(type
), p
, pp
);
2576 r
= RET_NERRNO(rename(p
, pp
));
2580 /* Otherwise, create the actual directory for the service */
2582 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2583 if (r
< 0 && r
!= -EEXIST
)
2587 if (!context
->directories
[type
].items
[i
].only_create
) {
2588 /* And link it up from the original place.
2590 * 1) If a mount namespace is going to be used, then this symlink remains on
2591 * the host, and a new one for the child namespace will be created later.
2592 * 2) It is not necessary to create this symlink when one of its parent
2593 * directories is specified and already created. E.g.
2594 * StateDirectory=foo foo/bar
2595 * In that case, the inode points to pp and p for "foo/bar" are the same:
2596 * pp = "/var/lib/private/foo/bar"
2597 * p = "/var/lib/foo/bar"
2598 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2599 * we do not need to create the symlink, but we cannot create the symlink.
2600 * See issue #24783. */
2601 r
= symlink_idempotent(pp
, p
, true);
2607 _cleanup_free_
char *target
= NULL
;
2609 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2610 readlink_and_make_absolute(p
, &target
) >= 0) {
2611 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2613 /* This already exists and is a symlink? Interesting. Maybe it's one created
2614 * by DynamicUser=1 (see above)?
2616 * We do this for all directory types except for ConfigurationDirectory=,
2617 * since they all support the private/ symlink logic at least in some
2618 * configurations, see above. */
2620 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2624 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2630 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2631 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2635 if (path_equal(q_resolved
, target_resolved
)) {
2637 /* Hmm, apparently DynamicUser= was once turned on for this service,
2638 * but is no longer. Let's move the directory back up. */
2640 log_unit_info(u
, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2641 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2642 exec_directory_type_to_string(type
), q
, p
);
2644 r
= RET_NERRNO(unlink(p
));
2648 r
= RET_NERRNO(rename(q
, p
));
2654 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2659 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2662 /* Don't change the owner/access mode of the configuration directory,
2663 * as in the common case it is not written to by a service, and shall
2664 * not be writable. */
2666 r
= RET_NERRNO(stat(p
, &st
));
2670 /* Still complain if the access mode doesn't match */
2671 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2672 log_unit_warning(u
, "%s \'%s\' already exists but the mode is different. "
2673 "(File system: %o %sMode: %o)",
2674 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2675 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2682 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2683 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2684 * current UID/GID ownership.) */
/* pp ?: p — operate on the private location if there is one, on the public one otherwise */
2685 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2689 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2690 * available to user code anyway */
2691 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2694 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2695 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2696 * assignments to exist. */
2697 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2702 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2703 * they are set up later, to allow configuring empty var/run/etc. */
2704 if (!needs_mount_namespace
)
2705 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2706 r
= create_many_symlinks(params
->prefix
[type
],
2707 context
->directories
[type
].items
[i
].path
,
2708 context
->directories
[type
].items
[i
].symlinks
);
2716 *exit_status
= exit_status_table
[type
];
/* Sets the SMACK label of the calling process (PID argument 0).
 * If context->smack_process_label is configured it is applied as-is. Otherwise, if the manager has a
 * default label, the SMACK_ATTR_EXEC label is read from executable_fd and applied, with the manager's
 * default used when no such label was read.
 * NOTE(review): the return statements are not visible in this excerpt. */
2721 static int setup_smack(
2722 const Manager
*manager
,
2723 const ExecContext
*context
,
2724 int executable_fd
) {
2728 assert(executable_fd
>= 0);
2730 if (context
->smack_process_label
) {
2731 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2734 } else if (manager
->default_smack_process_label
) {
2735 _cleanup_free_
char *exec_label
= NULL
;
2737 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
/* A missing xattr is not an error: exec_label then stays NULL and the default label is used below */
2738 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
2741 r
= mac_smack_apply_pid(0, exec_label
?: manager
->default_smack_process_label
);
2750 static int compile_bind_mounts(
2751 const ExecContext
*context
,
2752 const ExecParameters
*params
,
2753 BindMount
**ret_bind_mounts
,
2754 size_t *ret_n_bind_mounts
,
2755 char ***ret_empty_directories
) {
2757 _cleanup_strv_free_
char **empty_directories
= NULL
;
2758 BindMount
*bind_mounts
= NULL
;
2764 assert(ret_bind_mounts
);
2765 assert(ret_n_bind_mounts
);
2766 assert(ret_empty_directories
);
2768 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
2770 n
= context
->n_bind_mounts
;
2771 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2772 if (!params
->prefix
[t
])
2775 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++)
2776 n
+= !context
->directories
[t
].items
[i
].only_create
;
2780 *ret_bind_mounts
= NULL
;
2781 *ret_n_bind_mounts
= 0;
2782 *ret_empty_directories
= NULL
;
2786 bind_mounts
= new(BindMount
, n
);
2790 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
2791 BindMount
*item
= context
->bind_mounts
+ i
;
2792 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2794 s
= strdup(item
->source
);
2798 d
= strdup(item
->destination
);
2802 bind_mounts
[h
++] = (BindMount
) {
2803 .source
= TAKE_PTR(s
),
2804 .destination
= TAKE_PTR(d
),
2805 .read_only
= item
->read_only
,
2806 .recursive
= item
->recursive
,
2807 .ignore_enoent
= item
->ignore_enoent
,
2811 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2812 if (!params
->prefix
[t
])
2815 if (context
->directories
[t
].n_items
== 0)
2818 if (exec_directory_is_private(context
, t
) &&
2819 !exec_context_with_rootfs(context
)) {
2822 /* So this is for a dynamic user, and we need to make sure the process can access its own
2823 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2824 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2826 private_root
= path_join(params
->prefix
[t
], "private");
2830 r
= strv_consume(&empty_directories
, private_root
);
2835 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
2836 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2838 /* When one of the parent directories is in the list, we cannot create the symlink
2839 * for the child directory. See also the comments in setup_exec_directory(). */
2840 if (context
->directories
[t
].items
[i
].only_create
)
2843 if (exec_directory_is_private(context
, t
))
2844 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
2846 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
2850 if (exec_directory_is_private(context
, t
) &&
2851 exec_context_with_rootfs(context
))
2852 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2853 * directory is not created on the root directory. So, let's bind-mount the directory
2854 * on the 'non-private' place. */
2855 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
2861 bind_mounts
[h
++] = (BindMount
) {
2862 .source
= TAKE_PTR(s
),
2863 .destination
= TAKE_PTR(d
),
2865 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
2867 .ignore_enoent
= false,
2874 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
2875 *ret_n_bind_mounts
= n
;
2876 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2881 /* ret_symlinks will contain a list of pairs src:dest that describes
2882 * the symlinks to create later on. For example, the symlinks needed
2883 * to safely give private directories to DynamicUser=1 users. */
2884 static int compile_symlinks(
2885 const ExecContext
*context
,
2886 const ExecParameters
*params
,
2887 bool setup_os_release_symlink
,
2888 char ***ret_symlinks
) {
2890 _cleanup_strv_free_
char **symlinks
= NULL
;
2895 assert(ret_symlinks
);
2897 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2898 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
2899 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
2901 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
2902 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
2904 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
2905 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
2906 if (!src_abs
|| !dst_abs
)
2909 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
2914 if (!exec_directory_is_private(context
, dt
) ||
2915 exec_context_with_rootfs(context
) ||
2916 context
->directories
[dt
].items
[i
].only_create
)
2919 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
2923 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
2927 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
2933 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2934 * and readers will never get a half-written version. Note that, while the paths specified here are
2935 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2936 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2937 if (setup_os_release_symlink
) {
2938 r
= strv_extend(&symlinks
, "/run/host/.os-release-stage/os-release");
2942 r
= strv_extend(&symlinks
, "/run/host/os-release");
2947 *ret_symlinks
= TAKE_PTR(symlinks
);
/* Decides whether file system namespacing is mandatory for this unit.
 *
 * The settings examined, in order, are: temporary file systems, a root directory or root image, mount
 * images, DynamicUser=, extension images or extension directories, any bind mount whose source and
 * destination differ (per path_equal()), and a log namespace.
 *
 * bind_mounts may be NULL only when n_bind_mounts is 0.
 * NOTE(review): the statement executed for each matching condition is not visible in this excerpt;
 * presumably each one returns true and the function returns false otherwise. Confirm against the full source. */
2952 static bool insist_on_sandboxing(
2953 const ExecContext
*context
,
2954 const char *root_dir
,
2955 const char *root_image
,
2956 const BindMount
*bind_mounts
,
2957 size_t n_bind_mounts
) {
2960 assert(n_bind_mounts
== 0 || bind_mounts
);
2962 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2963 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2964 * rearrange stuff in a way we cannot ignore gracefully. */
2966 if (context
->n_temporary_filesystems
> 0)
2969 if (root_dir
|| root_image
)
2972 if (context
->n_mount_images
> 0)
2975 if (context
->dynamic_user
)
2978 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
2981 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2983 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
2984 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
2987 if (context
->log_namespace
)
2993 static int setup_ephemeral(const ExecContext
*context
, ExecRuntime
*runtime
) {
2994 _cleanup_close_
int fd
= -EBADF
;
2997 if (!runtime
|| !runtime
->ephemeral_copy
)
3000 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
3002 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
3004 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
3006 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
3008 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3012 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3014 log_debug("Making ephemeral snapshot of %s to %s",
3015 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3017 if (context
->root_image
)
3018 fd
= copy_file(context
->root_image
, runtime
->ephemeral_copy
, O_EXCL
, 0600,
3019 COPY_LOCK_BSD
|COPY_REFLINK
|COPY_CRTIME
);
3021 fd
= btrfs_subvol_snapshot_at(AT_FDCWD
, context
->root_directory
,
3022 AT_FDCWD
, runtime
->ephemeral_copy
,
3023 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3024 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3025 BTRFS_SNAPSHOT_RECURSIVE
|
3026 BTRFS_SNAPSHOT_LOCK_BSD
);
3028 return log_debug_errno(fd
, "Failed to snapshot %s to %s: %m",
3029 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3031 if (context
->root_image
) {
3032 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3033 * which tends to not perform well in combination with lots of random writes.
3035 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3036 * copy, but we at least want to make the intention clear.
3038 r
= chattr_fd(fd
, FS_NOCOW_FL
, FS_NOCOW_FL
, NULL
);
3040 log_debug_errno(fd
, "Failed to disable copy-on-write for %s, ignoring: %m", runtime
->ephemeral_copy
);
3043 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
3045 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
3050 static int verity_settings_prepare(
3051 VeritySettings
*verity
,
3052 const char *root_image
,
3053 const void *root_hash
,
3054 size_t root_hash_size
,
3055 const char *root_hash_path
,
3056 const void *root_hash_sig
,
3057 size_t root_hash_sig_size
,
3058 const char *root_hash_sig_path
,
3059 const char *verity_data_path
) {
3068 d
= memdup(root_hash
, root_hash_size
);
3072 free_and_replace(verity
->root_hash
, d
);
3073 verity
->root_hash_size
= root_hash_size
;
3074 verity
->designator
= PARTITION_ROOT
;
3077 if (root_hash_sig
) {
3080 d
= memdup(root_hash_sig
, root_hash_sig_size
);
3084 free_and_replace(verity
->root_hash_sig
, d
);
3085 verity
->root_hash_sig_size
= root_hash_sig_size
;
3086 verity
->designator
= PARTITION_ROOT
;
3089 if (verity_data_path
) {
3090 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
3095 r
= verity_settings_load(
3099 root_hash_sig_path
);
3101 return log_debug_errno(r
, "Failed to load root hash: %m");
3106 static int apply_mount_namespace(
3108 ExecCommandFlags command_flags
,
3109 const ExecContext
*context
,
3110 const ExecParameters
*params
,
3111 ExecRuntime
*runtime
,
3112 const char *memory_pressure_path
,
3113 char **error_path
) {
3115 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3116 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
3117 **read_write_paths_cleanup
= NULL
;
3118 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3119 *extension_dir
= NULL
, *host_os_release_stage
= NULL
;
3120 const char *root_dir
= NULL
, *root_image
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3121 char **read_write_paths
;
3122 NamespaceInfo ns_info
;
3123 bool needs_sandboxing
, setup_os_release_symlink
;
3124 BindMount
*bind_mounts
= NULL
;
3125 size_t n_bind_mounts
= 0;
3130 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
3132 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3133 r
= setup_ephemeral(context
, runtime
);
3137 if (context
->root_image
)
3138 root_image
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_image
;
3140 root_dir
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
;
3143 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3147 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3148 * service will need to write to it in order to start the notifications. */
3149 if (context
->protect_control_groups
&& memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
3150 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
3151 if (!read_write_paths_cleanup
)
3154 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
3158 read_write_paths
= read_write_paths_cleanup
;
3160 read_write_paths
= context
->read_write_paths
;
3162 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3163 if (needs_sandboxing
) {
3164 /* The runtime struct only contains the parent of the private /tmp,
3165 * which is non-accessible to world users. Inside of it there's a /tmp
3166 * that is sticky, and that's the one we want to use here.
3167 * This does not apply when we are using /run/systemd/empty as fallback. */
3169 if (context
->private_tmp
&& runtime
&& runtime
->shared
) {
3170 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3171 tmp_dir
= runtime
->shared
->tmp_dir
;
3172 else if (runtime
->shared
->tmp_dir
)
3173 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
3175 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3176 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
3177 else if (runtime
->shared
->var_tmp_dir
)
3178 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
3181 ns_info
= (NamespaceInfo
) {
3182 .ignore_protect_paths
= false,
3183 .private_dev
= context
->private_devices
,
3184 .protect_control_groups
= context
->protect_control_groups
,
3185 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
3186 .protect_kernel_modules
= context
->protect_kernel_modules
,
3187 .protect_kernel_logs
= context
->protect_kernel_logs
,
3188 .protect_hostname
= context
->protect_hostname
,
3189 .mount_apivfs
= exec_context_get_effective_mount_apivfs(context
),
3190 .protect_home
= context
->protect_home
,
3191 .protect_system
= context
->protect_system
,
3192 .protect_proc
= context
->protect_proc
,
3193 .proc_subset
= context
->proc_subset
,
3194 .private_network
= exec_needs_network_namespace(context
),
3195 .private_ipc
= exec_needs_ipc_namespace(context
),
3196 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3197 .mount_nosuid
= context
->no_new_privileges
&& !mac_selinux_use(),
3199 } else if (!context
->dynamic_user
&& root_dir
)
3201 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
3202 * sandbox info, otherwise enforce it, don't ignore protected paths and
3203 * fail if we are unable to apply the sandbox inside the mount namespace.
3205 ns_info
= (NamespaceInfo
) {
3206 .ignore_protect_paths
= true,
3209 ns_info
= (NamespaceInfo
) {};
3211 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3212 setup_os_release_symlink
= ns_info
.mount_apivfs
&& (root_dir
|| root_image
);
3213 r
= compile_symlinks(context
, params
, setup_os_release_symlink
, &symlinks
);
3217 if (context
->mount_propagation_flag
== MS_SHARED
)
3218 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3220 if (exec_context_has_credentials(context
) &&
3221 params
->prefix
[EXEC_DIRECTORY_RUNTIME
] &&
3222 FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
3223 creds_path
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials", u
->id
);
3228 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
3229 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
3233 incoming_dir
= strdup("/run/systemd/incoming");
3237 extension_dir
= strdup("/run/systemd/unit-extensions");
3241 /* If running under a different root filesystem, propagate the host's os-release. We make a
3242 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3243 if (setup_os_release_symlink
) {
3244 host_os_release_stage
= strdup("/run/systemd/propagate/.os-release-stage");
3245 if (!host_os_release_stage
)
3249 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
3251 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0)
3254 if (setup_os_release_symlink
) {
3255 if (asprintf(&host_os_release_stage
,
3256 "/run/user/" UID_FMT
"/systemd/propagate/.os-release-stage",
3263 r
= verity_settings_prepare(
3266 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3267 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3268 context
->root_verity
);
3273 r
= setup_namespace(
3276 context
->root_image_options
,
3277 context
->root_image_policy
?: &image_policy_service
,
3280 needs_sandboxing
? context
->read_only_paths
: NULL
,
3281 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3282 needs_sandboxing
? context
->exec_paths
: NULL
,
3283 needs_sandboxing
? context
->no_exec_paths
: NULL
,
3288 context
->temporary_filesystems
,
3289 context
->n_temporary_filesystems
,
3290 context
->mount_images
,
3291 context
->n_mount_images
,
3292 context
->mount_image_policy
?: &image_policy_service
,
3296 context
->log_namespace
,
3297 context
->mount_propagation_flag
,
3299 context
->extension_images
,
3300 context
->n_extension_images
,
3301 context
->extension_image_policy
?: &image_policy_sysext
,
3302 context
->extension_directories
,
3306 root_dir
|| root_image
? params
->notify_socket
: NULL
,
3307 host_os_release_stage
,
3310 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3311 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3312 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3313 * completely different execution environment. */
3315 if (insist_on_sandboxing(
3317 root_dir
, root_image
,
3320 return log_unit_debug_errno(u
,
3321 SYNTHETIC_ERRNO(EOPNOTSUPP
),
3322 "Failed to set up namespace, and refusing to continue since "
3323 "the selected namespacing options alter mount environment non-trivially.\n"
3324 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3326 context
->n_temporary_filesystems
,
3329 yes_no(context
->dynamic_user
));
3331 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
3338 static int apply_working_directory(
3339 const ExecContext
*context
,
3340 const ExecParameters
*params
,
3341 ExecRuntime
*runtime
,
3348 assert(exit_status
);
3350 if (context
->working_directory_home
) {
3353 *exit_status
= EXIT_CHDIR
;
3360 wd
= empty_to_root(context
->working_directory
);
3362 if (params
->flags
& EXEC_APPLY_CHROOT
)
3365 d
= prefix_roota((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
, wd
);
3367 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
3368 *exit_status
= EXIT_CHDIR
;
/* chroot()s into the unit's root directory when no mount namespace takes care of that.
 *
 * The chroot() is only attempted if EXEC_APPLY_CHROOT is set in params->flags, needs_mount_ns is false
 * and context->root_directory is configured. The runtime's ephemeral_copy, if there is one, takes
 * precedence over context->root_directory as the chroot() target. If chroot() fails, *exit_status is set
 * to EXIT_CHROOT. */
3375 static int apply_root_directory(
3376 const ExecContext
*context
,
3377 const ExecParameters
*params
,
3378 ExecRuntime
*runtime
,
3379 const bool needs_mount_ns
,
3383 assert(exit_status
);
3385 if (params
->flags
& EXEC_APPLY_CHROOT
)
3386 if (!needs_mount_ns
&& context
->root_directory
)
3387 if (chroot((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
) < 0) {
3388 *exit_status
= EXIT_CHROOT
;
3395 static int setup_keyring(
3397 const ExecContext
*context
,
3398 const ExecParameters
*p
,
3399 uid_t uid
, gid_t gid
) {
3401 key_serial_t keyring
;
3410 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3411 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3412 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3413 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3414 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3415 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3417 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3420 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3421 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3422 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3423 * & group is just as nasty as acquiring a reference to the user keyring. */
3425 saved_uid
= getuid();
3426 saved_gid
= getgid();
3428 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3429 if (setregid(gid
, -1) < 0)
3430 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
3433 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3434 if (setreuid(uid
, -1) < 0) {
3435 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
3440 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3441 if (keyring
== -1) {
3442 if (errno
== ENOSYS
)
3443 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
3444 else if (ERRNO_IS_PRIVILEGE(errno
))
3445 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
3446 else if (errno
== EDQUOT
)
3447 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
3449 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
3454 /* When requested link the user keyring into the session keyring. */
3455 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3457 if (keyctl(KEYCTL_LINK
,
3458 KEY_SPEC_USER_KEYRING
,
3459 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3460 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
3465 /* Restore uid/gid back */
3466 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3467 if (setreuid(saved_uid
, -1) < 0) {
3468 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
3473 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3474 if (setregid(saved_gid
, -1) < 0)
3475 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
3478 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3479 if (!sd_id128_is_null(u
->invocation_id
)) {
3482 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
3484 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
3486 if (keyctl(KEYCTL_SETPERM
, key
,
3487 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3488 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3489 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
3494 /* Revert back uid & gid for the last time, and exit */
3495 /* no extra logging, as only the first already reported error matters */
3496 if (getuid() != saved_uid
)
3497 (void) setreuid(saved_uid
, -1);
3499 if (getgid() != saved_gid
)
3500 (void) setregid(saved_gid
, -1);
/* Appends the two file descriptors of 'pair' to 'array', advancing the element counter *n once for each
 * descriptor stored. The caller is responsible for 'array' having room for both entries.
 * NOTE(review): lines between the two stores are not visible in this excerpt; presumably each store is
 * guarded by a validity check on the descriptor. Confirm against the full source. */
3505 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
3511 array
[(*n
)++] = pair
[0];
3513 array
[(*n
)++] = pair
[1];
3516 static int close_remaining_fds(
3517 const ExecParameters
*params
,
3518 const ExecRuntime
*runtime
,
3521 const int *fds
, size_t n_fds
) {
3523 size_t n_dont_close
= 0;
3524 int dont_close
[n_fds
+ 14];
3528 if (params
->stdin_fd
>= 0)
3529 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3530 if (params
->stdout_fd
>= 0)
3531 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3532 if (params
->stderr_fd
>= 0)
3533 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3536 dont_close
[n_dont_close
++] = socket_fd
;
3538 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3539 n_dont_close
+= n_fds
;
3543 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
3545 if (runtime
&& runtime
->shared
) {
3546 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
3547 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
3550 if (runtime
&& runtime
->dynamic_creds
) {
3551 if (runtime
->dynamic_creds
->user
)
3552 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
3553 if (runtime
->dynamic_creds
->group
)
3554 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
3557 if (user_lookup_fd
>= 0)
3558 dont_close
[n_dont_close
++] = user_lookup_fd
;
3560 return close_all_fds(dont_close
, n_dont_close
);
3563 static int send_user_lookup(
3571 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3572 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3575 if (user_lookup_fd
< 0)
3578 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3581 if (writev(user_lookup_fd
,
3583 IOVEC_MAKE(&uid
, sizeof(uid
)),
3584 IOVEC_MAKE(&gid
, sizeof(gid
)),
3585 IOVEC_MAKE_STRING(unit
->id
) }, 3) < 0)
/* Determines the home directory to use when the unit asks for its working directory to be the home
 * directory (c->working_directory_home). When that setting is off the lookup is skipped. Otherwise
 * get_home_dir() is called, storing its result in *buf.
 * NOTE(review): how 'uid' and '*home' are used is not visible in this excerpt; presumably *home is
 * pointed at *buf after a successful lookup. Confirm against the full source. */
3591 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
3598 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3603 if (!c
->working_directory_home
)
3606 r
= get_home_dir(buf
);
3614 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3615 _cleanup_strv_free_
char ** list
= NULL
;
3622 assert(c
->dynamic_user
);
3624 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3625 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3628 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3629 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
3635 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
3638 if (exec_directory_is_private(c
, t
))
3639 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
3641 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
3645 r
= strv_consume(&list
, e
);
3651 *ret
= TAKE_PTR(list
);
3656 static int exec_parameters_get_cgroup_path(
3657 const ExecParameters
*params
,
3658 const CGroupContext
*c
,
3661 const char *subgroup
= NULL
;
3667 if (!params
->cgroup_path
)
3670 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3671 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3672 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3673 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3674 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3675 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3676 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3677 * flag, which is only passed for the former statements, not for the latter. */
3679 if (FLAGS_SET(params
->flags
, EXEC_CGROUP_DELEGATE
) && (FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
) || c
->delegate_subgroup
)) {
3680 if (FLAGS_SET(params
->flags
, EXEC_IS_CONTROL
))
3681 subgroup
= ".control";
3683 subgroup
= c
->delegate_subgroup
;
3687 p
= path_join(params
->cgroup_path
, subgroup
);
3689 p
= strdup(params
->cgroup_path
);
/* Derives a CPU affinity set from the context's NUMA policy and adds it to 'ret'.
 *
 * If the NUMA node mask of the policy is not set, a debug message is logged and no set is derived.
 * Otherwise numa_to_cpu_set() converts the policy into the temporary set 's', whose CPUs are then added
 * to 'ret' with cpu_set_add_all(); that call's result is returned. 's' is released automatically via
 * cpu_set_reset() when the function returns. */
3697 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
3698 _cleanup_(cpu_set_reset
) CPUSet s
= {};
3704 if (!c
->numa_policy
.nodes
.set
) {
3705 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3709 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
3715 return cpu_set_add_all(ret
, &s
);
/* Accessor: returns the cpu_affinity_from_numa flag of the given execution context. */
3718 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
3721 return c
->cpu_affinity_from_numa
;
/* Stores 'fd' in the next free slot fds[*n_fds] and also hands it back via *ret_fd.
 *
 * If 'fd' is numerically below 3 + *n_fds it is first duplicated with F_DUPFD_CLOEXEC to a descriptor
 * number at or above that bound, and the original is replaced by the duplicate (close_and_replace()),
 * so that it stays clear of the low fd range described in the comment below.
 *
 * The caller must ensure *n_fds < fds_size, i.e. that a free slot exists. */
3724 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
3729 assert(*n_fds
< fds_size
);
3737 if (fd
< 3 + (int) *n_fds
) {
3738 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3739 * the fds we pass to the process (or which are closed only during execve). */
3741 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
3745 close_and_replace(fd
, r
);
3748 *ret_fd
= fds
[*n_fds
] = fd
;
3753 static int connect_unix_harder(Unit
*u
, const OpenFile
*of
, int ofd
) {
3754 union sockaddr_union addr
= {
3755 .un
.sun_family
= AF_UNIX
,
3758 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
3765 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
3767 return log_unit_error_errno(u
, r
, "Failed to set sockaddr for %s: %m", of
->path
);
3771 for (size_t i
= 0; i
< ELEMENTSOF(socket_types
); i
++) {
3772 _cleanup_close_
int fd
= -EBADF
;
3774 fd
= socket(AF_UNIX
, socket_types
[i
] | SOCK_CLOEXEC
, 0);
3776 return log_unit_error_errno(u
, errno
, "Failed to create socket for %s: %m", of
->path
);
3778 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
3779 if (r
== -EPROTOTYPE
)
3782 return log_unit_error_errno(u
, r
, "Failed to connect socket for %s: %m", of
->path
);
3787 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EPROTOTYPE
), "Failed to connect socket for \"%s\".", of
->path
);
3790 static int get_open_file_fd(Unit
*u
, const OpenFile
*of
) {
3792 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
3797 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
3799 return log_unit_error_errno(u
, errno
, "Could not open \"%s\": %m", of
->path
);
3801 if (fstat(ofd
, &st
) < 0)
3802 return log_unit_error_errno(u
, errno
, "Failed to stat %s: %m", of
->path
);
3804 if (S_ISSOCK(st
.st_mode
)) {
3805 fd
= connect_unix_harder(u
, of
, ofd
);
3809 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
3810 return log_unit_error_errno(u
, errno
, "Failed to shutdown send for socket %s: %m",
3813 log_unit_debug(u
, "socket %s opened (fd=%d)", of
->path
, fd
);
3815 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
3816 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
3818 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
3821 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
3823 return log_unit_error_errno(u
, fd
, "Failed to open file %s: %m", of
->path
);
3825 log_unit_debug(u
, "file %s opened (fd=%d)", of
->path
, fd
);
3831 static int collect_open_file_fds(
3833 OpenFile
* open_files
,
3844 LIST_FOREACH(open_files
, of
, open_files
) {
3845 _cleanup_close_
int fd
= -EBADF
;
3847 fd
= get_open_file_fd(u
, of
);
3849 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
3850 log_unit_debug_errno(u
, fd
, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of
->path
);
3857 if (!GREEDY_REALLOC(*fds
, *n_fds
+ 1))
3860 r
= strv_extend(fdnames
, of
->fdname
);
3864 (*fds
)[*n_fds
] = TAKE_FD(fd
);
/* Emits a structured LOG_DEBUG record for 'unit' that carries an EXECUTABLE= field set to 'executable'
 * and a message of the form "<msg>: <command line>", where the command line is 'argv' quoted by
 * quote_command_line(). strnull() covers the case where quoting yielded NULL. The quoted string is freed
 * automatically when the function returns. */
3872 static void log_command_line(Unit
*unit
, const char *msg
, const char *executable
, char **argv
) {
3880 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
3882 log_unit_struct(unit
, LOG_DEBUG
,
3883 "EXECUTABLE=%s", executable
,
3884 LOG_UNIT_MESSAGE(unit
, "%s: %s", msg
, strnull(cmdline
)),
3885 LOG_UNIT_INVOCATION_ID(unit
));
3888 static bool exec_context_need_unprivileged_private_users(
3889 const ExecContext
*context
,
3890 const ExecParameters
*params
) {
3895 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3896 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3897 * (system manager) then we have privileges and don't need this. */
3898 if (params
->runtime_scope
!= RUNTIME_SCOPE_USER
)
3901 return context
->private_users
||
3902 context
->private_tmp
||
3903 context
->private_devices
||
3904 context
->private_network
||
3905 context
->network_namespace_path
||
3906 context
->private_ipc
||
3907 context
->ipc_namespace_path
||
3908 context
->private_mounts
> 0 ||
3909 context
->mount_apivfs
||
3910 context
->n_bind_mounts
> 0 ||
3911 context
->n_temporary_filesystems
> 0 ||
3912 context
->root_directory
||
3913 !strv_isempty(context
->extension_directories
) ||
3914 context
->protect_system
!= PROTECT_SYSTEM_NO
||
3915 context
->protect_home
!= PROTECT_HOME_NO
||
3916 context
->protect_kernel_tunables
||
3917 context
->protect_kernel_modules
||
3918 context
->protect_kernel_logs
||
3919 context
->protect_control_groups
||
3920 context
->protect_clock
||
3921 context
->protect_hostname
||
3922 !strv_isempty(context
->read_write_paths
) ||
3923 !strv_isempty(context
->read_only_paths
) ||
3924 !strv_isempty(context
->inaccessible_paths
) ||
3925 !strv_isempty(context
->exec_paths
) ||
3926 !strv_isempty(context
->no_exec_paths
);
3929 static int exec_child(
3931 const ExecCommand
*command
,
3932 const ExecContext
*context
,
3933 const ExecParameters
*params
,
3934 ExecRuntime
*runtime
,
3935 const CGroupContext
*cgroup_context
,
3937 const int named_iofds
[static 3],
3939 size_t n_socket_fds
,
3940 size_t n_storage_fds
,
3945 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
3946 int r
, ngids
= 0, exec_fd
;
3947 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
3948 const char *username
= NULL
, *groupname
= NULL
;
3949 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
;
3950 const char *home
= NULL
, *shell
= NULL
;
3951 char **final_argv
= NULL
;
3952 dev_t journal_stream_dev
= 0;
3953 ino_t journal_stream_ino
= 0;
3954 bool userns_set_up
= false;
3955 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3956 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
3957 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
3958 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
3960 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
3961 bool use_selinux
= false;
3964 bool use_smack
= false;
3967 bool use_apparmor
= false;
3969 uid_t saved_uid
= getuid();
3970 gid_t saved_gid
= getgid();
3971 uid_t uid
= UID_INVALID
;
3972 gid_t gid
= GID_INVALID
;
3973 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
3974 n_keep_fds
; /* total number of fds not to close */
3976 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
3977 int ngids_after_pam
= 0;
3978 _cleanup_free_
int *fds
= NULL
;
3979 _cleanup_strv_free_
char **fdnames
= NULL
;
3985 assert(exit_status
);
3987 /* Explicitly test for CVE-2021-4034 inspired invocations */
3988 assert(command
->path
);
3989 assert(!strv_isempty(command
->argv
));
3991 rename_process_from_path(command
->path
);
3993 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3994 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3995 * both of which will be demoted to SIG_DFL. */
3996 (void) default_signals(SIGNALS_CRASH_HANDLER
,
3999 if (context
->ignore_sigpipe
)
4000 (void) ignore_signals(SIGPIPE
);
4002 r
= reset_signal_mask();
4004 *exit_status
= EXIT_SIGNAL_MASK
;
4005 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4008 if (params
->idle_pipe
)
4009 do_idle_pipe_dance(params
->idle_pipe
);
4011 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4012 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4013 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4014 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4017 log_set_open_when_needed(true);
4018 log_settle_target();
4020 /* In case anything used libc syslog(), close this here, too */
4023 fds
= newdup(int, params_fds
, n_fds
);
4025 *exit_status
= EXIT_MEMORY
;
4029 fdnames
= strv_copy((char**) params
->fd_names
);
4031 *exit_status
= EXIT_MEMORY
;
4035 r
= collect_open_file_fds(unit
, params
->open_files
, &fds
, &fdnames
, &n_fds
);
4037 *exit_status
= EXIT_FDS
;
4038 return log_unit_error_errno(unit
, r
, "Failed to get OpenFile= file descriptors: %m");
4041 int keep_fds
[n_fds
+ 3];
4042 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4045 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4047 *exit_status
= EXIT_FDS
;
4048 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4052 if (unit
->manager
->restrict_fs
) {
4053 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4054 if (bpf_map_fd
< 0) {
4055 *exit_status
= EXIT_FDS
;
4056 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4059 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4061 *exit_status
= EXIT_FDS
;
4062 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4067 r
= close_remaining_fds(params
, runtime
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4069 *exit_status
= EXIT_FDS
;
4070 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4073 if (!context
->same_pgrp
&&
4075 *exit_status
= EXIT_SETSID
;
4076 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4079 exec_context_tty_reset(context
, params
);
4081 if (unit_shall_confirm_spawn(unit
)) {
4082 _cleanup_free_
char *cmdline
= NULL
;
4084 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4086 *exit_status
= EXIT_MEMORY
;
4090 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4091 if (r
!= CONFIRM_EXECUTE
) {
4092 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4093 *exit_status
= EXIT_SUCCESS
;
4097 *exit_status
= EXIT_CONFIRM
;
4098 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4099 "Execution cancelled by the user");
4103 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4104 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4105 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4106 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4107 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4108 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4109 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4110 *exit_status
= EXIT_MEMORY
;
4111 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4114 if (context
->dynamic_user
&& runtime
&& runtime
->dynamic_creds
) {
4115 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4117 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4118 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4119 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4120 *exit_status
= EXIT_USER
;
4121 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4124 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4126 *exit_status
= EXIT_MEMORY
;
4130 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
4132 *exit_status
= EXIT_USER
;
4134 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4135 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4136 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
4139 if (!uid_is_valid(uid
)) {
4140 *exit_status
= EXIT_USER
;
4141 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
4144 if (!gid_is_valid(gid
)) {
4145 *exit_status
= EXIT_USER
;
4146 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
4149 if (runtime
->dynamic_creds
->user
)
4150 username
= runtime
->dynamic_creds
->user
->name
;
4153 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
4155 *exit_status
= EXIT_USER
;
4156 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
4159 r
= get_fixed_group(context
, &groupname
, &gid
);
4161 *exit_status
= EXIT_GROUP
;
4162 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
4166 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4167 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
4168 &supplementary_gids
, &ngids
);
4170 *exit_status
= EXIT_GROUP
;
4171 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
4174 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
4176 *exit_status
= EXIT_USER
;
4177 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
4180 user_lookup_fd
= safe_close(user_lookup_fd
);
4182 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
4184 *exit_status
= EXIT_CHDIR
;
4185 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
4188 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4190 (void) fd_nonblock(socket_fd
, false);
4192 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4193 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4194 if (params
->cgroup_path
) {
4195 _cleanup_free_
char *p
= NULL
;
4197 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
4199 *exit_status
= EXIT_CGROUP
;
4200 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4203 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
4204 if (r
== -EUCLEAN
) {
4205 *exit_status
= EXIT_CGROUP
;
4206 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
4207 "because the cgroup or one of its parents or "
4208 "siblings is in the threaded mode: %m", p
);
4211 *exit_status
= EXIT_CGROUP
;
4212 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
4216 if (context
->network_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4217 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
4219 *exit_status
= EXIT_NETWORK
;
4220 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
4224 if (context
->ipc_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4225 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
4227 *exit_status
= EXIT_NAMESPACE
;
4228 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
4232 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
4234 *exit_status
= EXIT_STDIN
;
4235 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
4238 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4240 *exit_status
= EXIT_STDOUT
;
4241 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
4244 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4246 *exit_status
= EXIT_STDERR
;
4247 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
4250 if (context
->oom_score_adjust_set
) {
4251 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4252 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4253 r
= set_oom_score_adjust(context
->oom_score_adjust
);
4254 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4255 log_unit_debug_errno(unit
, r
,
4256 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4258 *exit_status
= EXIT_OOM_ADJUST
;
4259 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
4263 if (context
->coredump_filter_set
) {
4264 r
= set_coredump_filter(context
->coredump_filter
);
4265 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4266 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
4268 *exit_status
= EXIT_LIMITS
;
4269 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
4273 if (context
->nice_set
) {
4274 r
= setpriority_closest(context
->nice
);
4276 *exit_status
= EXIT_NICE
;
4277 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
4281 if (context
->cpu_sched_set
) {
4282 struct sched_param param
= {
4283 .sched_priority
= context
->cpu_sched_priority
,
4286 r
= sched_setscheduler(0,
4287 context
->cpu_sched_policy
|
4288 (context
->cpu_sched_reset_on_fork
?
4289 SCHED_RESET_ON_FORK
: 0),
4292 *exit_status
= EXIT_SETSCHEDULER
;
4293 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
4297 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
4298 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
4299 const CPUSet
*cpu_set
;
4301 if (context
->cpu_affinity_from_numa
) {
4302 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
4304 *exit_status
= EXIT_CPUAFFINITY
;
4305 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
4308 cpu_set
= &converted_cpu_set
;
4310 cpu_set
= &context
->cpu_set
;
4312 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
4313 *exit_status
= EXIT_CPUAFFINITY
;
4314 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
4318 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
4319 r
= apply_numa_policy(&context
->numa_policy
);
4320 if (ERRNO_IS_NEG_NOT_SUPPORTED(r
))
4321 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
4323 *exit_status
= EXIT_NUMA_POLICY
;
4324 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
4328 if (context
->ioprio_set
)
4329 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
4330 *exit_status
= EXIT_IOPRIO
;
4331 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
4334 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
4335 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
4336 *exit_status
= EXIT_TIMERSLACK
;
4337 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
4340 if (context
->personality
!= PERSONALITY_INVALID
) {
4341 r
= safe_personality(context
->personality
);
4343 *exit_status
= EXIT_PERSONALITY
;
4344 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
4348 if (context
->utmp_id
) {
4349 const char *line
= context
->tty_path
?
4350 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
4352 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
4354 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
4355 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
4360 if (uid_is_valid(uid
)) {
4361 r
= chown_terminal(STDIN_FILENO
, uid
);
4363 *exit_status
= EXIT_STDIN
;
4364 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
4368 if (params
->cgroup_path
) {
4369 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4370 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4371 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4372 * touch a single hierarchy too. */
4374 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
4375 _cleanup_free_
char *p
= NULL
;
4377 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
4379 *exit_status
= EXIT_CGROUP
;
4380 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
4383 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
4385 *exit_status
= EXIT_CGROUP
;
4386 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4389 r
= cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER
, p
, uid
, gid
);
4391 *exit_status
= EXIT_CGROUP
;
4392 return log_unit_error_errno(unit
, r
, "Failed to adjust control subgroup access: %m");
4397 if (cgroup_context
&& cg_unified() > 0 && is_pressure_supported() > 0) {
4398 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
4399 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
4401 *exit_status
= EXIT_MEMORY
;
4405 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
4407 log_unit_full_errno(unit
, r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
4408 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
4409 memory_pressure_path
= mfree(memory_pressure_path
);
4411 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_OFF
) {
4412 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4413 if (!memory_pressure_path
) {
4414 *exit_status
= EXIT_MEMORY
;
4421 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
4423 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4424 r
= setup_exec_directory(unit
, context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
4426 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
4429 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4430 r
= setup_credentials(context
, params
, unit
->id
, uid
, gid
);
4432 *exit_status
= EXIT_CREDENTIALS
;
4433 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
4437 r
= build_environment(
4449 memory_pressure_path
,
4452 *exit_status
= EXIT_MEMORY
;
4456 r
= build_pass_environment(context
, &pass_env
);
4458 *exit_status
= EXIT_MEMORY
;
4462 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4463 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4464 * not specify PATH but the unit has ExecSearchPath. */
4465 if (!strv_isempty(context
->exec_search_path
)) {
4466 _cleanup_free_
char *joined
= NULL
;
4468 joined
= strv_join(context
->exec_search_path
, ":");
4470 *exit_status
= EXIT_MEMORY
;
4474 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
4476 *exit_status
= EXIT_MEMORY
;
4481 accum_env
= strv_env_merge(params
->environment
,
4483 joined_exec_search_path
,
4485 context
->environment
,
4488 *exit_status
= EXIT_MEMORY
;
4491 accum_env
= strv_env_clean(accum_env
);
4493 (void) umask(context
->umask
);
4495 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
4497 *exit_status
= EXIT_KEYRING
;
4498 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
4501 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4503 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4505 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4506 * for it, and the kernel doesn't actually support ambient caps. */
4507 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
4509 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4510 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4512 if (needs_ambient_hack
)
4513 needs_setuid
= false;
4515 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
4517 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
4519 if (needs_sandboxing
) {
4520 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4521 * /sys being present. The actual MAC context application will happen later, as late as
4522 * possible, to avoid impacting our own code paths. */
4525 use_selinux
= mac_selinux_use();
4528 use_smack
= mac_smack_use();
4531 use_apparmor
= mac_apparmor_use();
4535 if (needs_sandboxing
) {
4538 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4539 * is set here. (See below.) */
4541 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
4543 *exit_status
= EXIT_LIMITS
;
4544 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
4548 if (needs_setuid
&& context
->pam_name
&& username
) {
4549 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4550 * wins here. (See above.) */
4552 /* All fds passed in the fds array will be closed in the pam child process. */
4553 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
4555 *exit_status
= EXIT_PAM
;
4556 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
4559 if (ambient_capabilities_supported()) {
4560 uint64_t ambient_after_pam
;
4562 /* PAM modules might have set some ambient caps. Query them here and merge them into
4563 * the caps we want to set in the end, so that we don't end up unsetting them. */
4564 r
= capability_get_ambient(&ambient_after_pam
);
4566 *exit_status
= EXIT_CAPABILITIES
;
4567 return log_unit_error_errno(unit
, r
, "Failed to query ambient caps: %m");
4570 capability_ambient_set
|= ambient_after_pam
;
4573 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
4574 if (ngids_after_pam
< 0) {
4575 *exit_status
= EXIT_MEMORY
;
4576 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
4580 if (needs_sandboxing
&& exec_context_need_unprivileged_private_users(context
, params
)) {
4581 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4582 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4583 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4585 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4586 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4587 * the actual requested operations fail (or silently continue). */
4588 if (r
< 0 && context
->private_users
) {
4589 *exit_status
= EXIT_USER
;
4590 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
4593 log_unit_info_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4595 userns_set_up
= true;
4598 if (exec_needs_network_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4600 /* Try to enable network namespacing if network namespacing is available and we have
4601 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4602 * new network namespace. And if we don't have that, then we could only create a network
4603 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4604 if (ns_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
4605 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
4606 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4607 log_unit_notice_errno(unit
, r
,
4608 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4610 *exit_status
= EXIT_NETWORK
;
4611 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
4613 } else if (context
->network_namespace_path
) {
4614 *exit_status
= EXIT_NETWORK
;
4615 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4616 "NetworkNamespacePath= is not supported, refusing.");
4618 log_unit_notice(unit
, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4621 if (exec_needs_ipc_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4623 if (ns_type_supported(NAMESPACE_IPC
)) {
4624 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
4626 log_unit_warning_errno(unit
, r
,
4627 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4629 *exit_status
= EXIT_NAMESPACE
;
4630 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
4632 } else if (context
->ipc_namespace_path
) {
4633 *exit_status
= EXIT_NAMESPACE
;
4634 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4635 "IPCNamespacePath= is not supported, refusing.");
4637 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4640 if (needs_mount_namespace
) {
4641 _cleanup_free_
char *error_path
= NULL
;
4643 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, memory_pressure_path
, &error_path
);
4645 *exit_status
= EXIT_NAMESPACE
;
4646 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
4647 error_path
? ": " : "", strempty(error_path
));
4651 if (needs_sandboxing
) {
4652 r
= apply_protect_hostname(unit
, context
, exit_status
);
4657 if (context
->memory_ksm
>= 0)
4658 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
) < 0) {
4659 if (ERRNO_IS_NOT_SUPPORTED(errno
))
4660 log_unit_debug_errno(unit
, errno
, "KSM support not available, ignoring.");
4662 *exit_status
= EXIT_KSM
;
4663 return log_unit_error_errno(unit
, errno
, "Failed to set KSM: %m");
4667 /* Drop groups as early as possible.
4668 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4669 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4671 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
4672 int ngids_to_enforce
= 0;
4674 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
4679 if (ngids_to_enforce
< 0) {
4680 *exit_status
= EXIT_MEMORY
;
4681 return log_unit_error_errno(unit
,
4683 "Failed to merge group lists. Group membership might be incorrect: %m");
4686 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
4688 *exit_status
= EXIT_GROUP
;
4689 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
4693 /* If the user namespace was not set up above, try to do it now.
4694 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4695 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4696 * case of mount namespaces being less privileged when the mount point list is copied from a
4697 * different user namespace). */
4699 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
4700 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4702 *exit_status
= EXIT_USER
;
4703 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
4707 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4710 _cleanup_free_
char *executable
= NULL
;
4711 _cleanup_close_
int executable_fd
= -EBADF
;
4712 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
4714 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
4715 log_unit_struct_errno(unit
, LOG_INFO
, r
,
4716 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4717 LOG_UNIT_INVOCATION_ID(unit
),
4718 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
4720 "EXECUTABLE=%s", command
->path
);
4721 *exit_status
= EXIT_SUCCESS
;
4725 *exit_status
= EXIT_EXEC
;
4726 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
4727 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4728 LOG_UNIT_INVOCATION_ID(unit
),
4729 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
4731 "EXECUTABLE=%s", command
->path
);
4734 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
4736 *exit_status
= EXIT_FDS
;
4737 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4741 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
4746 else if (params
->n_socket_fds
== 1)
4747 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4748 * use context from that fd to compute the label. */
4749 fd
= params
->fds
[0];
4752 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
4754 if (!context
->selinux_context_ignore
) {
4755 *exit_status
= EXIT_SELINUX_CONTEXT
;
4756 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
4758 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
4764 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4765 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4766 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4769 r
= close_all_fds(keep_fds
, n_keep_fds
);
4771 r
= shift_fds(fds
, n_fds
);
4773 r
= flags_fds(fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
4775 *exit_status
= EXIT_FDS
;
4776 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
4779 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4780 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4781 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4784 secure_bits
= context
->secure_bits
;
4786 if (needs_sandboxing
) {
4789 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4790 * (Note this is placed after the general resource limit initialization, see above, in order
4791 * to take precedence.) */
4792 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
4793 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
4794 *exit_status
= EXIT_LIMITS
;
4795 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4800 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4801 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4803 r
= setup_smack(unit
->manager
, context
, executable_fd
);
4804 if (r
< 0 && !context
->smack_process_label_ignore
) {
4805 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
4806 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
4811 bset
= context
->capability_bounding_set
;
4812 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4813 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4814 * instead of us doing that */
4815 if (needs_ambient_hack
)
4816 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
4817 (UINT64_C(1) << CAP_SETUID
) |
4818 (UINT64_C(1) << CAP_SETGID
);
4820 if (!cap_test_all(bset
)) {
4821 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
4823 *exit_status
= EXIT_CAPABILITIES
;
4824 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
4828 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4831 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4832 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4833 * the ambient capabilities can be raised as they are present in the permitted and
4834 * inheritable set. However it is possible that someone wants to set ambient capabilities
4835 * without changing the user, so we also set the ambient capabilities here.
4837 * The requested ambient capabilities are raised in the inheritable set if the second
4838 * argument is true. */
4839 if (!needs_ambient_hack
) {
4840 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
4842 *exit_status
= EXIT_CAPABILITIES
;
4843 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
4848 /* chroot to root directory first, before we lose the ability to chroot */
4849 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
4851 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
4854 if (uid_is_valid(uid
)) {
4855 r
= enforce_user(context
, uid
, capability_ambient_set
);
4857 *exit_status
= EXIT_USER
;
4858 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
4861 if (!needs_ambient_hack
&& capability_ambient_set
!= 0) {
4863 /* Raise the ambient capabilities after user change. */
4864 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
4866 *exit_status
= EXIT_CAPABILITIES
;
4867 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
4873 /* Apply working directory here, because the working directory might be on NFS and only the user running
4874 * this service might have the correct privilege to change to the working directory */
4875 r
= apply_working_directory(context
, params
, runtime
, home
, exit_status
);
4877 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
4879 if (needs_sandboxing
) {
4880 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4881 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4882 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4883 * are restricted. */
4887 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
4890 r
= setexeccon(exec_context
);
4892 if (!context
->selinux_context_ignore
) {
4893 *exit_status
= EXIT_SELINUX_CONTEXT
;
4894 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
4896 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
4903 if (use_apparmor
&& context
->apparmor_profile
) {
4904 r
= aa_change_onexec(context
->apparmor_profile
);
4905 if (r
< 0 && !context
->apparmor_profile_ignore
) {
4906 *exit_status
= EXIT_APPARMOR_PROFILE
;
4907 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
4912 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4913 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4914 * requires CAP_SETPCAP. */
4915 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
4916 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4917 * effective set here.
4919 * The effective set is overwritten during execve() with the following values:
4921 * - ambient set (for non-root processes)
4923 * - (inheritable | bounding) set (for root processes)
4925 * Hence there is no security impact to raise it in the effective set before execve
4927 r
= capability_gain_cap_setpcap(/* return_caps= */ NULL
);
4929 *exit_status
= EXIT_CAPABILITIES
;
4930 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
4932 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
4933 *exit_status
= EXIT_SECUREBITS
;
4934 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
4938 if (context_has_no_new_privileges(context
))
4939 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
4940 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
4941 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
4945 r
= apply_address_families(unit
, context
);
4947 *exit_status
= EXIT_ADDRESS_FAMILIES
;
4948 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
4951 r
= apply_memory_deny_write_execute(unit
, context
);
4953 *exit_status
= EXIT_SECCOMP
;
4954 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
4957 r
= apply_restrict_realtime(unit
, context
);
4959 *exit_status
= EXIT_SECCOMP
;
4960 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
4963 r
= apply_restrict_suid_sgid(unit
, context
);
4965 *exit_status
= EXIT_SECCOMP
;
4966 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
4969 r
= apply_restrict_namespaces(unit
, context
);
4971 *exit_status
= EXIT_SECCOMP
;
4972 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
4975 r
= apply_protect_sysctl(unit
, context
);
4977 *exit_status
= EXIT_SECCOMP
;
4978 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
4981 r
= apply_protect_kernel_modules(unit
, context
);
4983 *exit_status
= EXIT_SECCOMP
;
4984 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
4987 r
= apply_protect_kernel_logs(unit
, context
);
4989 *exit_status
= EXIT_SECCOMP
;
4990 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
4993 r
= apply_protect_clock(unit
, context
);
4995 *exit_status
= EXIT_SECCOMP
;
4996 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
4999 r
= apply_private_devices(unit
, context
);
5001 *exit_status
= EXIT_SECCOMP
;
5002 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5005 r
= apply_syscall_archs(unit
, context
);
5007 *exit_status
= EXIT_SECCOMP
;
5008 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5011 r
= apply_lock_personality(unit
, context
);
5013 *exit_status
= EXIT_SECCOMP
;
5014 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5017 r
= apply_syscall_log(unit
, context
);
5019 *exit_status
= EXIT_SECCOMP
;
5020 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5023 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5024 * by the filter as little as possible. */
5025 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5027 *exit_status
= EXIT_SECCOMP
;
5028 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5033 r
= apply_restrict_filesystems(unit
, context
);
5035 *exit_status
= EXIT_BPF
;
5036 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5042 if (!strv_isempty(context
->unset_environment
)) {
5045 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5047 *exit_status
= EXIT_MEMORY
;
5051 strv_free_and_replace(accum_env
, ee
);
5054 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5055 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5057 r
= replace_env_argv(command
->argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5059 *exit_status
= EXIT_MEMORY
;
5060 return log_unit_error_errno(unit
, r
, "Failed to replace environment variables: %m");
5062 final_argv
= replaced_argv
;
5064 if (!strv_isempty(unset_variables
)) {
5065 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5066 log_unit_warning(unit
, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju
));
5069 if (!strv_isempty(bad_variables
)) {
5070 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5071 log_unit_warning(unit
, "Invalid environment variable name evaluates to an empty string: %s", strna(jb
));;
5074 final_argv
= command
->argv
;
5076 log_command_line(unit
, "Executing", executable
, final_argv
);
5081 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5082 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5084 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5085 *exit_status
= EXIT_EXEC
;
5086 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5090 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5095 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5096 * that POLLHUP on it no longer means execve() succeeded. */
5098 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5099 *exit_status
= EXIT_EXEC
;
5100 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5104 *exit_status
= EXIT_EXEC
;
5105 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
5108 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5109 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
/* Prepares and forks a process for 'command' on behalf of 'unit'.
 *
 * Steps visible in this block, in order:
 *  - validate socket-based stdio settings against the number of passed sockets,
 *  - resolve named stdio fds via exec_context_named_iofds(),
 *  - load the environment files via exec_context_load_environment(),
 *  - reload the SELinux label database if needed and log the command line,
 *  - if params->cgroup_path is set, look up the subcgroup path and create it,
 *  - fork; exec_child() is invoked and, on failure, a structured SD_MESSAGE_SPAWN_FAILED_STR
 *    record naming the failing step is logged,
 *  - attach the new PID to the cgroup and record the start in command->exec_status.
 *
 * Each failing preparation step is logged with log_unit_error_errno() and its result returned.
 *
 * NOTE(review): the embedded numbering skips lines in this block (e.g. 5117-5118, 5123-5129,
 * 5185-5188), so the end of the parameter list, the branch structure around the fork and the
 * final return are not visible here; confirm against the complete file. */
5111 int exec_spawn(Unit
*unit
,
5112 ExecCommand
*command
,
5113 const ExecContext
*context
,
5114 const ExecParameters
*params
,
5115 ExecRuntime
*runtime
,
5116 const CGroupContext
*cgroup_context
,
5119 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
5120 _cleanup_free_
char *subcgroup_path
= NULL
;
5121 _cleanup_strv_free_
char **files_env
= NULL
;
5122 size_t n_storage_fds
= 0, n_socket_fds
= 0;
5130 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
5132 LOG_CONTEXT_PUSH_UNIT(unit
);
/* If stdin, stdout or stderr is configured as a socket, exactly one socket fd must have been
 * passed; the first entry of params->fds is then used as socket_fd. */
5134 if (context
->std_input
== EXEC_INPUT_SOCKET
||
5135 context
->std_output
== EXEC_OUTPUT_SOCKET
||
5136 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
5138 if (params
->n_socket_fds
> 1)
5139 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
5141 if (params
->n_socket_fds
== 0)
5142 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
5144 socket_fd
= params
->fds
[0];
5148 n_socket_fds
= params
->n_socket_fds
;
5149 n_storage_fds
= params
->n_storage_fds
;
5152 r
= exec_context_named_iofds(context
, params
, named_iofds
);
5154 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
5156 r
= exec_context_load_environment(unit
, context
, &files_env
);
5158 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
5160 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5161 and, until the next SELinux policy changes, we save further reloads in future children. */
5162 mac_selinux_maybe_reload();
5164 /* We won't know the real executable path until we create the mount namespace in the child, but we
5165 want to log from the parent, so we use the possibly inaccurate path here. */
5166 log_command_line(unit
, "About to execute", command
->path
, command
->argv
);
5168 if (params
->cgroup_path
) {
5169 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &subcgroup_path
);
5171 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
5173 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5174 * realized by the unit logic) */
5176 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
5178 return log_unit_error_errno(unit
, r
, "Failed to create subcgroup '%s': %m", subcgroup_path
);
5184 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
5189 r
= exec_child(unit
,
5201 unit
->manager
->user_lookup_fds
[1],
5205 const char *status
= ASSERT_PTR(
5206 exit_status_to_string(exit_status
, EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
));
5208 log_unit_struct_errno(unit
, LOG_ERR
, r
,
5209 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5210 LOG_UNIT_INVOCATION_ID(unit
),
5211 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
5212 status
, command
->path
),
5213 "EXECUTABLE=%s", command
->path
);
5215 assert(exit_status
== EXIT_SUCCESS
);
5220 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
5222 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5223 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5224 * process will be killed too). */
5226 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
5228 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its non-zero defaults.
 *
 * Visible defaults: default I/O priority, SCHED_OTHER, syslog priority LOG_DAEMON|LOG_INFO with
 * level prefix enabled, SIGPIPE ignored, no timer slack / personality / clean timeout set, mode
 * 0755 for every directory type, unset capability bounding set, initial namespace restriction
 * flags, log_level_max and private_mounts set to -1, seccomp error action "kill", unset TTY
 * dimensions (UINT_MAX) and a reset NUMA policy.
 *
 * NOTE(review): the embedded numbering skips 5235-5237, 5252, 5254 and everything after 5258,
 * so further assignments may exist that are not shown in this view; confirm against the
 * complete file. */
5234 void exec_context_init(ExecContext
*c
) {
5238 c
->ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
;
5239 c
->cpu_sched_policy
= SCHED_OTHER
;
5240 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
5241 c
->syslog_level_prefix
= true;
5242 c
->ignore_sigpipe
= true;
5243 c
->timer_slack_nsec
= NSEC_INFINITY
;
5244 c
->personality
= PERSONALITY_INVALID
;
/* Every directory type starts out with mode 0755. */
5245 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5246 c
->directories
[t
].mode
= 0755;
5247 c
->timeout_clean_usec
= USEC_INFINITY
;
5248 c
->capability_bounding_set
= CAP_MASK_UNSET
;
/* Compile-time check that the initial value is distinguishable from "all namespaces". */
5249 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
5250 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
5251 c
->log_level_max
= -1;
5253 c
->syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
;
5255 c
->tty_rows
= UINT_MAX
;
5256 c
->tty_cols
= UINT_MAX
;
5257 numa_policy_reset(&c
->numa_policy
);
5258 c
->private_mounts
= -1;
5262 void exec_context_done(ExecContext
*c
) {
5265 c
->environment
= strv_free(c
->environment
);
5266 c
->environment_files
= strv_free(c
->environment_files
);
5267 c
->pass_environment
= strv_free(c
->pass_environment
);
5268 c
->unset_environment
= strv_free(c
->unset_environment
);
5270 rlimit_free_all(c
->rlimit
);
5272 for (size_t l
= 0; l
< 3; l
++) {
5273 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
5274 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
5277 c
->working_directory
= mfree(c
->working_directory
);
5278 c
->root_directory
= mfree(c
->root_directory
);
5279 c
->root_image
= mfree(c
->root_image
);
5280 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
5281 c
->root_hash
= mfree(c
->root_hash
);
5282 c
->root_hash_size
= 0;
5283 c
->root_hash_path
= mfree(c
->root_hash_path
);
5284 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
5285 c
->root_hash_sig_size
= 0;
5286 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
5287 c
->root_verity
= mfree(c
->root_verity
);
5288 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
5289 c
->extension_directories
= strv_free(c
->extension_directories
);
5290 c
->tty_path
= mfree(c
->tty_path
);
5291 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
5292 c
->user
= mfree(c
->user
);
5293 c
->group
= mfree(c
->group
);
5295 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
5297 c
->pam_name
= mfree(c
->pam_name
);
5299 c
->read_only_paths
= strv_free(c
->read_only_paths
);
5300 c
->read_write_paths
= strv_free(c
->read_write_paths
);
5301 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
5302 c
->exec_paths
= strv_free(c
->exec_paths
);
5303 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
5304 c
->exec_search_path
= strv_free(c
->exec_search_path
);
5306 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
5307 c
->bind_mounts
= NULL
;
5308 c
->n_bind_mounts
= 0;
5309 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
5310 c
->temporary_filesystems
= NULL
;
5311 c
->n_temporary_filesystems
= 0;
5312 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
5314 cpu_set_reset(&c
->cpu_set
);
5315 numa_policy_reset(&c
->numa_policy
);
5317 c
->utmp_id
= mfree(c
->utmp_id
);
5318 c
->selinux_context
= mfree(c
->selinux_context
);
5319 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
5320 c
->smack_process_label
= mfree(c
->smack_process_label
);
5322 c
->restrict_filesystems
= set_free_free(c
->restrict_filesystems
);
5324 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
5325 c
->syscall_archs
= set_free(c
->syscall_archs
);
5326 c
->address_families
= set_free(c
->address_families
);
5328 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5329 exec_directory_done(&c
->directories
[t
]);
5331 c
->log_level_max
= -1;
5333 exec_context_free_log_extra_fields(c
);
5334 c
->log_filter_allowed_patterns
= set_free_free(c
->log_filter_allowed_patterns
);
5335 c
->log_filter_denied_patterns
= set_free_free(c
->log_filter_denied_patterns
);
5337 c
->log_ratelimit_interval_usec
= 0;
5338 c
->log_ratelimit_burst
= 0;
5340 c
->stdin_data
= mfree(c
->stdin_data
);
5341 c
->stdin_data_size
= 0;
5343 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
5344 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
5346 c
->log_namespace
= mfree(c
->log_namespace
);
5348 c
->load_credentials
= hashmap_free(c
->load_credentials
);
5349 c
->set_credentials
= hashmap_free(c
->set_credentials
);
5350 c
->import_credentials
= set_free_free(c
->import_credentials
);
5352 c
->root_image_policy
= image_policy_free(c
->root_image_policy
);
5353 c
->mount_image_policy
= image_policy_free(c
->mount_image_policy
);
5354 c
->extension_image_policy
= image_policy_free(c
->extension_image_policy
);
/* Removes the runtime directories configured in 'c' below 'runtime_prefix'.
 *
 * For every EXEC_DIRECTORY_RUNTIME item the path is built with path_join(), inserting a
 * "private" component when exec_directory_is_private() reports true, and the tree is deleted
 * with rm_rf(REMOVE_ROOT). Each symlink configured for the item is built the same way and
 * removed with unlink(). The results of rm_rf() and unlink() are explicitly discarded.
 *
 * NOTE(review): the action taken when 'runtime_prefix' is NULL, the handling of a failed
 * path_join() and the return value are not visible in this view (the embedded numbering skips
 * lines such as 5361, 5368, 5370-5371); confirm against the complete file. */
5357 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
5360 if (!runtime_prefix
)
5363 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
5364 _cleanup_free_
char *p
= NULL
;
5366 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5367 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5369 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5373 /* We execute this synchronously, since we need to be sure this is gone when we start the
5375 (void) rm_rf(p
, REMOVE_ROOT
);
5377 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
5378 _cleanup_free_
char *symlink_abs
= NULL
;
5380 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5381 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
5383 symlink_abs
= path_join(runtime_prefix
, *symlink
);
5387 (void) unlink(symlink_abs
);
/* Removes the mount propagation directory of unit 'u', i.e. "/run/systemd/propagate/" joined
 * with the unit id. The directory is removed with rmdir(), so it has to be empty; a failure
 * other than ENOENT is only logged at debug level.
 *
 * The first check guards against a NULL unit and against managers that are not the system
 * manager. NOTE(review): the statement executed when that guard matches, the handling of a
 * failed path_join() and the return value are not visible in this view; presumably the guard
 * returns early — confirm against the complete file. */
5394 int exec_context_destroy_mount_ns_dir(Unit
*u
) {
5395 _cleanup_free_
char *p
= NULL
;
5397 if (!u
|| !MANAGER_IS_SYSTEM(u
->manager
))
5400 p
= path_join("/run/systemd/propagate/", u
->id
);
5404 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5405 if (rmdir(p
) < 0 && errno
!= ENOENT
)
5406 log_unit_debug_errno(u
, errno
, "Unable to remove propagation dir '%s', ignoring: %m", p
);
5411 static void exec_command_done(ExecCommand
*c
) {
5414 c
->path
= mfree(c
->path
);
5415 c
->argv
= strv_free(c
->argv
);
5418 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
5419 for (size_t i
= 0; i
< n
; i
++)
5420 exec_command_done(c
+i
);
/* Tears down a whole list of commands: entries are popped off the 'command' list headed by 'c'
 * one at a time with LIST_POP() and their contents released with exec_command_done().
 *
 * NOTE(review): the declaration of 'i', the release of the popped entry itself and the return
 * statement are not visible in this view. exec_command_free_array() stores the result back
 * into the list head, so the return value is presumably NULL — confirm against the complete
 * file. */
5423 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
5426 while ((i
= LIST_POP(command
, c
))) {
5427 exec_command_done(i
);
5434 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
5435 for (size_t i
= 0; i
< n
; i
++)
5436 c
[i
] = exec_command_free_list(c
[i
]);
5439 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
5440 for (size_t i
= 0; i
< n
; i
++)
5441 exec_status_reset(&c
[i
].exec_status
);
5444 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
5445 for (size_t i
= 0; i
< n
; i
++)
5446 LIST_FOREACH(command
, z
, c
[i
])
5447 exec_status_reset(&z
->exec_status
);
/* Context handed to invalid_env() as its userdata argument. The members are not visible in
 * this view; invalid_env() reads a 'unit' (used for logging) and a 'path' from it. */
5450 typedef struct InvalidEnvInfo
{
5455 static void invalid_env(const char *p
, void *userdata
) {
5456 InvalidEnvInfo
*info
= userdata
;
5458 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name to look for when the standard stream 'fd_index' is configured as a
 * named fd: the explicitly configured c->stdio_fdname[] entry if set, otherwise the literal
 * "stdin", "stdout" or "stderr".
 *
 * Each return is preceded by a check that the corresponding stream actually uses
 * EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD. NOTE(review): the dispatch on 'fd_index' and
 * the value returned when that check fails are not visible in this view; the caller
 * exec_context_named_iofds() tests the result for NULL, so presumably NULL — confirm against
 * the complete file. */
5461 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
5467 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
5470 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
5473 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
5476 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
5479 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
5482 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the standard streams that are configured as named fds to actual file descriptors.
 *
 * 'targets' is initialised to the number of streams (stdin, stdout, stderr) using a named fd.
 * The names to look for are obtained from exec_context_fdname(). The first
 * p->n_storage_fds + p->n_socket_fds entries of p->fds are then scanned: when p->fd_names[i]
 * equals the wanted name of a stream whose slot in 'named_iofds' is still negative, p->fds[i]
 * is stored in that slot. At most one stream is assigned per scanned fd (else-if chain), and
 * the scan stops early once 'targets' reaches zero.
 *
 * Returns 0 if 'targets' is zero at the end, -ENOENT otherwise.
 *
 * NOTE(review): the declarations of 'targets' and 'n_fds' and the statements following each
 * assignment (embedded numbers 5518, 5526, 5534) are not visible in this view; presumably
 * 'targets' is decremented per match — confirm against the complete file. */
5489 static int exec_context_named_iofds(
5490 const ExecContext
*c
,
5491 const ExecParameters
*p
,
5492 int named_iofds
[static 3]) {
5495 const char* stdio_fdname
[3];
5500 assert(named_iofds
);
5502 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
5503 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
5504 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
5506 for (size_t i
= 0; i
< 3; i
++)
5507 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
5509 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
5511 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
5512 if (named_iofds
[STDIN_FILENO
] < 0 &&
5513 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
5514 stdio_fdname
[STDIN_FILENO
] &&
5515 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
5517 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
5520 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
5521 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
5522 stdio_fdname
[STDOUT_FILENO
] &&
5523 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
5525 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
5528 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
5529 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
5530 stdio_fdname
[STDERR_FILENO
] &&
5531 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
5533 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
5537 return targets
== 0 ? 0 : -ENOENT
;
/* Builds an environment block from the files listed in c->environment_files.
 *
 * Per entry: the file name is checked with path_is_absolute(), expanded with safe_glob(), and
 * every match is read with load_env_file(). The loaded list is filtered through
 * strv_env_clean_with_callback(), which reports rejected assignments via invalid_env()
 * together with the file they came from, and then merged into the accumulated list 'v' with
 * strv_env_merge().
 *
 * NOTE(review): the derivation of 'fn' and 'ignore' from each entry, all error handling and
 * the hand-over of 'v' to '*ret' are not visible in this view (the embedded numbering skips
 * many lines); confirm against the complete file. */
5540 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
5541 _cleanup_strv_free_
char **v
= NULL
;
5547 STRV_FOREACH(i
, c
->environment_files
) {
5548 _cleanup_globfree_ glob_t pglob
= {};
5549 bool ignore
= false;
5557 if (!path_is_absolute(fn
)) {
5563 /* Filename supports globbing, take all matching files */
5564 r
= safe_glob(fn
, 0, &pglob
);
5571 /* When we don't match anything, -ENOENT should be returned */
5572 assert(pglob
.gl_pathc
> 0);
5574 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
5575 _cleanup_strv_free_
char **p
= NULL
;
5577 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
5584 /* Log invalid environment variables with filename */
5586 InvalidEnvInfo info
= {
5588 .path
= pglob
.gl_pathv
[n
]
5591 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
5597 char **m
= strv_env_merge(v
, p
);
5601 strv_free_and_replace(v
, m
);
/* Conservative check whether 'tty' may refer to the same device as /dev/console.
 *
 * 'tty' is first passed through skip_dev_prefix() and compared against "console". If the
 * console cannot be resolved with resolve_dev_console() the answer is true (it may match).
 * Otherwise the resolved console is compared with 'tty'; a console resolving to "tty0" is
 * treated as matching any virtual console, per the comment on the final return.
 *
 * NOTE(review): the statement executed for the "console" identity case and any handling of a
 * NULL 'tty' are not visible in this view; confirm against the complete file. */
5611 static bool tty_may_match_dev_console(const char *tty
) {
5612 _cleanup_free_
char *resolved
= NULL
;
5617 tty
= skip_dev_prefix(tty
);
5619 /* trivial identity? */
5620 if (streq(tty
, "console"))
5623 if (resolve_dev_console(&resolved
) < 0)
5624 return true; /* if we could not resolve, assume it may */
5626 /* "tty0" means the active VC, so it may be the same sometimes */
5627 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Reports whether executing with this context may access a TTY. The visible conditions are:
 * tty_reset or tty_vt_disallocate is set, or stdin, stdout or stderr is a terminal-type
 * stream according to is_terminal_input() / is_terminal_output().
 *
 * NOTE(review): the embedded numbering skips 5634 in the middle of the || chain, so one more
 * condition may exist that is not shown in this view; confirm against the complete file. */
5630 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
5633 return ec
->tty_reset
||
5635 ec
->tty_vt_disallocate
||
5636 is_terminal_input(ec
->std_input
) ||
5637 is_terminal_output(ec
->std_output
) ||
5638 is_terminal_output(ec
->std_error
);
5641 bool exec_context_may_touch_console(const ExecContext
*ec
) {
5643 return exec_context_may_touch_tty(ec
) &&
5644 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to 'f', each one preceded by a single space.
 * NOTE(review): the loop that defines 'g' is not visible in this view; presumably it iterates
 * over the string list 'l' — confirm against the complete file. */
5647 static void strv_fprintf(FILE *f
, char **l
) {
5651 fprintf(f
, " %s", *g
);
/* Dumps a named string list to 'f': if 'strv' is non-empty, prints "<prefix><name>:" and then
 * the entries via strv_fprintf(). Nothing is printed for an empty list.
 * NOTE(review): whatever follows the strv_fprintf() call (e.g. a line terminator) is not
 * visible in this view; confirm against the complete file. */
5654 static void strv_dump(FILE* f
, const char *prefix
, const char *name
, char **strv
) {
5659 if (!strv_isempty(strv
)) {
5660 fprintf(f
, "%s%s:", prefix
, name
);
5661 strv_fprintf(f
, strv
);
5666 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
5672 prefix
= strempty(prefix
);
5676 "%sWorkingDirectory: %s\n"
5677 "%sRootDirectory: %s\n"
5678 "%sRootEphemeral: %s\n"
5679 "%sNonBlocking: %s\n"
5680 "%sPrivateTmp: %s\n"
5681 "%sPrivateDevices: %s\n"
5682 "%sProtectKernelTunables: %s\n"
5683 "%sProtectKernelModules: %s\n"
5684 "%sProtectKernelLogs: %s\n"
5685 "%sProtectClock: %s\n"
5686 "%sProtectControlGroups: %s\n"
5687 "%sPrivateNetwork: %s\n"
5688 "%sPrivateUsers: %s\n"
5689 "%sProtectHome: %s\n"
5690 "%sProtectSystem: %s\n"
5691 "%sMountAPIVFS: %s\n"
5692 "%sIgnoreSIGPIPE: %s\n"
5693 "%sMemoryDenyWriteExecute: %s\n"
5694 "%sRestrictRealtime: %s\n"
5695 "%sRestrictSUIDSGID: %s\n"
5696 "%sKeyringMode: %s\n"
5697 "%sProtectHostname: %s\n"
5698 "%sProtectProc: %s\n"
5699 "%sProcSubset: %s\n",
5701 prefix
, empty_to_root(c
->working_directory
),
5702 prefix
, empty_to_root(c
->root_directory
),
5703 prefix
, yes_no(c
->root_ephemeral
),
5704 prefix
, yes_no(c
->non_blocking
),
5705 prefix
, yes_no(c
->private_tmp
),
5706 prefix
, yes_no(c
->private_devices
),
5707 prefix
, yes_no(c
->protect_kernel_tunables
),
5708 prefix
, yes_no(c
->protect_kernel_modules
),
5709 prefix
, yes_no(c
->protect_kernel_logs
),
5710 prefix
, yes_no(c
->protect_clock
),
5711 prefix
, yes_no(c
->protect_control_groups
),
5712 prefix
, yes_no(c
->private_network
),
5713 prefix
, yes_no(c
->private_users
),
5714 prefix
, protect_home_to_string(c
->protect_home
),
5715 prefix
, protect_system_to_string(c
->protect_system
),
5716 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
5717 prefix
, yes_no(c
->ignore_sigpipe
),
5718 prefix
, yes_no(c
->memory_deny_write_execute
),
5719 prefix
, yes_no(c
->restrict_realtime
),
5720 prefix
, yes_no(c
->restrict_suid_sgid
),
5721 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
5722 prefix
, yes_no(c
->protect_hostname
),
5723 prefix
, protect_proc_to_string(c
->protect_proc
),
5724 prefix
, proc_subset_to_string(c
->proc_subset
));
5727 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
5729 if (c
->root_image_options
) {
5730 fprintf(f
, "%sRootImageOptions:", prefix
);
5731 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
5732 if (!isempty(o
->options
))
5733 fprintf(f
, " %s:%s",
5734 partition_designator_to_string(o
->partition_designator
),
5740 _cleanup_free_
char *encoded
= NULL
;
5741 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
5743 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
5746 if (c
->root_hash_path
)
5747 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
5749 if (c
->root_hash_sig
) {
5750 _cleanup_free_
char *encoded
= NULL
;
5752 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
5754 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
5757 if (c
->root_hash_sig_path
)
5758 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
5761 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
5763 STRV_FOREACH(e
, c
->environment
)
5764 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
5766 STRV_FOREACH(e
, c
->environment_files
)
5767 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
5769 STRV_FOREACH(e
, c
->pass_environment
)
5770 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
5772 STRV_FOREACH(e
, c
->unset_environment
)
5773 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
5775 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
5777 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5778 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
5780 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
5781 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
5783 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
5784 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
5788 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
5791 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
5793 if (c
->oom_score_adjust_set
)
5794 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
5796 if (c
->coredump_filter_set
)
5797 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
5799 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
5801 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
5802 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
5803 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
5804 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
5807 if (c
->ioprio_set
) {
5808 _cleanup_free_
char *class_str
= NULL
;
5810 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
5812 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
5814 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
5817 if (c
->cpu_sched_set
) {
5818 _cleanup_free_
char *policy_str
= NULL
;
5820 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
5822 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
5825 "%sCPUSchedulingPriority: %i\n"
5826 "%sCPUSchedulingResetOnFork: %s\n",
5827 prefix
, c
->cpu_sched_priority
,
5828 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
5831 if (c
->cpu_set
.set
) {
5832 _cleanup_free_
char *affinity
= NULL
;
5834 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
5835 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
5838 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
5839 _cleanup_free_
char *nodes
= NULL
;
5841 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
5842 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
5843 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
5846 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
5847 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
5850 "%sStandardInput: %s\n"
5851 "%sStandardOutput: %s\n"
5852 "%sStandardError: %s\n",
5853 prefix
, exec_input_to_string(c
->std_input
),
5854 prefix
, exec_output_to_string(c
->std_output
),
5855 prefix
, exec_output_to_string(c
->std_error
));
5857 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
5858 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
5859 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
5860 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
5861 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
5862 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
5864 if (c
->std_input
== EXEC_INPUT_FILE
)
5865 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
5866 if (c
->std_output
== EXEC_OUTPUT_FILE
)
5867 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5868 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
5869 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5870 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
5871 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5872 if (c
->std_error
== EXEC_OUTPUT_FILE
)
5873 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5874 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
5875 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5876 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
5877 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5883 "%sTTYVHangup: %s\n"
5884 "%sTTYVTDisallocate: %s\n"
5886 "%sTTYColumns: %u\n",
5887 prefix
, c
->tty_path
,
5888 prefix
, yes_no(c
->tty_reset
),
5889 prefix
, yes_no(c
->tty_vhangup
),
5890 prefix
, yes_no(c
->tty_vt_disallocate
),
5891 prefix
, c
->tty_rows
,
5892 prefix
, c
->tty_cols
);
5894 if (IN_SET(c
->std_output
,
5896 EXEC_OUTPUT_JOURNAL
,
5897 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5898 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
5899 IN_SET(c
->std_error
,
5901 EXEC_OUTPUT_JOURNAL
,
5902 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5903 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
5905 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
5907 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
5909 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
5911 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
5913 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
5916 if (c
->log_level_max
>= 0) {
5917 _cleanup_free_
char *t
= NULL
;
5919 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
5921 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
5924 if (c
->log_ratelimit_interval_usec
> 0)
5926 "%sLogRateLimitIntervalSec: %s\n",
5927 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
5929 if (c
->log_ratelimit_burst
> 0)
5930 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
5932 if (!set_isempty(c
->log_filter_allowed_patterns
) || !set_isempty(c
->log_filter_denied_patterns
)) {
5933 fprintf(f
, "%sLogFilterPatterns:", prefix
);
5936 SET_FOREACH(pattern
, c
->log_filter_allowed_patterns
)
5937 fprintf(f
, " %s", pattern
);
5938 SET_FOREACH(pattern
, c
->log_filter_denied_patterns
)
5939 fprintf(f
, " ~%s", pattern
);
5943 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
5944 fprintf(f
, "%sLogExtraFields: ", prefix
);
5945 fwrite(c
->log_extra_fields
[j
].iov_base
,
5946 1, c
->log_extra_fields
[j
].iov_len
,
5951 if (c
->log_namespace
)
5952 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
5954 if (c
->secure_bits
) {
5955 _cleanup_free_
char *str
= NULL
;
5957 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
5959 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
5962 if (c
->capability_bounding_set
!= CAP_MASK_UNSET
) {
5963 _cleanup_free_
char *str
= NULL
;
5965 r
= capability_set_to_string(c
->capability_bounding_set
, &str
);
5967 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
5970 if (c
->capability_ambient_set
!= 0) {
5971 _cleanup_free_
char *str
= NULL
;
5973 r
= capability_set_to_string(c
->capability_ambient_set
, &str
);
5975 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
5979 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
5981 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
5983 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
5985 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
5988 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
5990 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
5991 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
5992 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
5993 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
5994 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
5995 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
5997 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
5998 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
5999 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6000 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6001 c
->bind_mounts
[i
].source
,
6002 c
->bind_mounts
[i
].destination
,
6003 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6005 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6006 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6008 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6010 isempty(t
->options
) ? "" : ":",
6011 strempty(t
->options
));
6016 "%sUtmpIdentifier: %s\n",
6017 prefix
, c
->utmp_id
);
6019 if (c
->selinux_context
)
6021 "%sSELinuxContext: %s%s\n",
6022 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6024 if (c
->apparmor_profile
)
6026 "%sAppArmorProfile: %s%s\n",
6027 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6029 if (c
->smack_process_label
)
6031 "%sSmackProcessLabel: %s%s\n",
6032 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6034 if (c
->personality
!= PERSONALITY_INVALID
)
6036 "%sPersonality: %s\n",
6037 prefix
, strna(personality_to_string(c
->personality
)));
6040 "%sLockPersonality: %s\n",
6041 prefix
, yes_no(c
->lock_personality
));
6043 if (c
->syscall_filter
) {
6045 "%sSystemCallFilter: ",
6048 if (!c
->syscall_allow_list
)
6054 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6055 _cleanup_free_
char *name
= NULL
;
6056 const char *errno_name
= NULL
;
6057 int num
= PTR_TO_INT(val
);
6064 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6065 fputs(strna(name
), f
);
6068 errno_name
= seccomp_errno_or_action_to_string(num
);
6070 fprintf(f
, ":%s", errno_name
);
6072 fprintf(f
, ":%d", num
);
6080 if (c
->syscall_archs
) {
6082 "%sSystemCallArchitectures:",
6087 SET_FOREACH(id
, c
->syscall_archs
)
6088 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6093 if (exec_context_restrict_namespaces_set(c
)) {
6094 _cleanup_free_
char *s
= NULL
;
6096 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
6098 fprintf(f
, "%sRestrictNamespaces: %s\n",
6103 if (exec_context_restrict_filesystems_set(c
)) {
6105 SET_FOREACH(fs
, c
->restrict_filesystems
)
6106 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
6110 if (c
->network_namespace_path
)
6112 "%sNetworkNamespacePath: %s\n",
6113 prefix
, c
->network_namespace_path
);
6115 if (c
->syscall_errno
> 0) {
6116 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
6119 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
6121 fputs(errno_name
, f
);
6123 fprintf(f
, "%d", c
->syscall_errno
);
6128 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
6129 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
6130 c
->mount_images
[i
].ignore_enoent
? "-": "",
6131 c
->mount_images
[i
].source
,
6132 c
->mount_images
[i
].destination
);
6133 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
6134 fprintf(f
, ":%s:%s",
6135 partition_designator_to_string(o
->partition_designator
),
6136 strempty(o
->options
));
6140 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
6141 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
6142 c
->extension_images
[i
].ignore_enoent
? "-": "",
6143 c
->extension_images
[i
].source
);
6144 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
6145 fprintf(f
, ":%s:%s",
6146 partition_designator_to_string(o
->partition_designator
),
6147 strempty(o
->options
));
6151 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
6154 bool exec_context_maintains_privileges(const ExecContext
*c
) {
6157 /* Returns true if the process forked off would run under
6158 * an unchanged UID or as root. */
6163 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
6169 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
6177 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
6179 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6181 return ioprio_normalize(p
);
6184 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
6187 /* Explicit setting wins */
6188 if (c
->mount_apivfs_set
)
6189 return c
->mount_apivfs
;
6191 /* Default to "yes" if root directory or image are specified */
6192 if (exec_context_with_rootfs(c
))
6198 void exec_context_free_log_extra_fields(ExecContext
*c
) {
6201 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
6202 free(c
->log_extra_fields
[l
].iov_base
);
6203 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
6204 c
->n_log_extra_fields
= 0;
6207 void exec_context_revert_tty(ExecContext
*c
) {
6208 _cleanup_close_
int fd
= -EBADF
;
6215 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6216 exec_context_tty_reset(c
, NULL
);
6218 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6219 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6220 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6221 if (!exec_context_may_touch_tty(c
))
6224 path
= exec_context_tty_path(c
);
6228 fd
= open(path
, O_PATH
|O_CLOEXEC
);
6230 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
6231 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6234 if (fstat(fd
, &st
) < 0)
6235 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
6237 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6238 * if things are a character device, since a proper check either means we'd have to open the TTY and
6239 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6240 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6241 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6242 if (!S_ISCHR(st
.st_mode
))
6243 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
6245 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
6247 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
6250 int exec_context_get_clean_directories(
6256 _cleanup_strv_free_
char **l
= NULL
;
6263 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
6264 if (!FLAGS_SET(mask
, 1U << t
))
6270 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
6273 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
6277 r
= strv_consume(&l
, j
);
6281 /* Also remove private directories unconditionally. */
6282 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
6283 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
6287 r
= strv_consume(&l
, j
);
6292 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
6293 j
= path_join(prefix
[t
], *symlink
);
6297 r
= strv_consume(&l
, j
);
6308 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
6309 ExecCleanMask mask
= 0;
6314 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6315 if (c
->directories
[t
].n_items
> 0)
6322 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
6329 dual_timestamp_get(&s
->start_timestamp
);
6332 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
6340 dual_timestamp_get(&s
->exit_timestamp
);
6345 if (context
&& context
->utmp_id
)
6346 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
6349 void exec_status_reset(ExecStatus
*s
) {
6352 *s
= (ExecStatus
) {};
6355 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
6362 prefix
= strempty(prefix
);
6365 "%sPID: "PID_FMT
"\n",
6368 if (dual_timestamp_is_set(&s
->start_timestamp
))
6370 "%sStart Timestamp: %s\n",
6371 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
6373 if (dual_timestamp_is_set(&s
->exit_timestamp
))
6375 "%sExit Timestamp: %s\n"
6377 "%sExit Status: %i\n",
6378 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
6379 prefix
, sigchld_code_to_string(s
->code
),
6383 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6384 _cleanup_free_
char *cmd
= NULL
;
6385 const char *prefix2
;
6390 prefix
= strempty(prefix
);
6391 prefix2
= strjoina(prefix
, "\t");
6393 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
6396 "%sCommand Line: %s\n",
6397 prefix
, strnull(cmd
));
6399 exec_status_dump(&c
->exec_status
, f
, prefix2
);
6402 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6405 prefix
= strempty(prefix
);
6407 LIST_FOREACH(command
, i
, c
)
6408 exec_command_dump(i
, f
, prefix
);
6411 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
6418 /* It's kind of important, that we keep the order here */
6419 end
= LIST_FIND_TAIL(command
, *l
);
6420 LIST_INSERT_AFTER(command
, *l
, end
, e
);
6425 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
6433 l
= strv_new_ap(path
, ap
);
6445 free_and_replace(c
->path
, p
);
6447 return strv_free_and_replace(c
->argv
, l
);
6450 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
6451 _cleanup_strv_free_
char **l
= NULL
;
6459 l
= strv_new_ap(path
, ap
);
6465 r
= strv_extend_strv(&c
->argv
, l
, false);
6472 static char *destroy_tree(char *path
) {
6476 if (!path_equal(path
, RUN_SYSTEMD_EMPTY
)) {
6477 log_debug("Spawning process to nuke '%s'", path
);
6479 (void) asynchronous_rm_rf(path
, REMOVE_ROOT
|REMOVE_SUBVOLUME
|REMOVE_PHYSICAL
);
6485 static ExecSharedRuntime
* exec_shared_runtime_free(ExecSharedRuntime
*rt
) {
6490 (void) hashmap_remove(rt
->manager
->exec_shared_runtime_by_id
, rt
->id
);
6492 rt
->id
= mfree(rt
->id
);
6493 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
6494 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
6495 safe_close_pair(rt
->netns_storage_socket
);
6496 safe_close_pair(rt
->ipcns_storage_socket
);
6500 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime
, exec_shared_runtime
, exec_shared_runtime_free
);
6501 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime
*, exec_shared_runtime_free
);
6503 ExecSharedRuntime
* exec_shared_runtime_destroy(ExecSharedRuntime
*rt
) {
6507 assert(rt
->n_ref
> 0);
6513 rt
->tmp_dir
= destroy_tree(rt
->tmp_dir
);
6514 rt
->var_tmp_dir
= destroy_tree(rt
->var_tmp_dir
);
6516 return exec_shared_runtime_free(rt
);
6519 static int exec_shared_runtime_allocate(ExecSharedRuntime
**ret
, const char *id
) {
6520 _cleanup_free_
char *id_copy
= NULL
;
6521 ExecSharedRuntime
*n
;
6525 id_copy
= strdup(id
);
6529 n
= new(ExecSharedRuntime
, 1);
6533 *n
= (ExecSharedRuntime
) {
6534 .id
= TAKE_PTR(id_copy
),
6535 .netns_storage_socket
= PIPE_EBADF
,
6536 .ipcns_storage_socket
= PIPE_EBADF
,
6543 static int exec_shared_runtime_add(
6548 int netns_storage_socket
[2],
6549 int ipcns_storage_socket
[2],
6550 ExecSharedRuntime
**ret
) {
6552 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt
= NULL
;
6558 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6560 r
= exec_shared_runtime_allocate(&rt
, id
);
6564 r
= hashmap_ensure_put(&m
->exec_shared_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
6568 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
6569 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
6570 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
6572 if (netns_storage_socket
) {
6573 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
6574 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
6577 if (ipcns_storage_socket
) {
6578 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
6579 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
6586 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6591 static int exec_shared_runtime_make(
6593 const ExecContext
*c
,
6595 ExecSharedRuntime
**ret
) {
6597 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6598 _cleanup_close_pair_
int netns_storage_socket
[2] = PIPE_EBADF
, ipcns_storage_socket
[2] = PIPE_EBADF
;
6605 /* It is not necessary to create ExecSharedRuntime object. */
6606 if (!exec_needs_network_namespace(c
) && !exec_needs_ipc_namespace(c
) && !c
->private_tmp
) {
6611 if (c
->private_tmp
&&
6612 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
6613 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
6614 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
6615 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
6620 if (exec_needs_network_namespace(c
)) {
6621 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
6625 if (exec_needs_ipc_namespace(c
)) {
6626 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
6630 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
6637 int exec_shared_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecSharedRuntime
**ret
) {
6638 ExecSharedRuntime
*rt
;
6645 rt
= hashmap_get(m
->exec_shared_runtime_by_id
, id
);
6647 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6655 /* If not found, then create a new object. */
6656 r
= exec_shared_runtime_make(m
, c
, id
, &rt
);
6660 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6666 /* increment reference counter. */
6672 int exec_shared_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
6673 ExecSharedRuntime
*rt
;
6679 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
6680 fprintf(f
, "exec-runtime=%s", rt
->id
);
6683 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
6685 if (rt
->var_tmp_dir
)
6686 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
6688 if (rt
->netns_storage_socket
[0] >= 0) {
6691 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
6695 fprintf(f
, " netns-socket-0=%i", copy
);
6698 if (rt
->netns_storage_socket
[1] >= 0) {
6701 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
6705 fprintf(f
, " netns-socket-1=%i", copy
);
6708 if (rt
->ipcns_storage_socket
[0] >= 0) {
6711 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
6715 fprintf(f
, " ipcns-socket-0=%i", copy
);
6718 if (rt
->ipcns_storage_socket
[1] >= 0) {
6721 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
6725 fprintf(f
, " ipcns-socket-1=%i", copy
);
6734 int exec_shared_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
6735 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt_create
= NULL
;
6736 ExecSharedRuntime
*rt
;
6739 /* This is for the migration from old (v237 or earlier) deserialization text.
6740 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6741 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6742 * so or not from the serialized text, then we always creates a new object owned by this. */
6748 /* Manager manages ExecSharedRuntime objects by the unit id.
6749 * So, we omit the serialized text when the unit does not have id (yet?)... */
6750 if (isempty(u
->id
)) {
6751 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
6755 if (hashmap_ensure_allocated(&u
->manager
->exec_shared_runtime_by_id
, &string_hash_ops
) < 0)
6758 rt
= hashmap_get(u
->manager
->exec_shared_runtime_by_id
, u
->id
);
6760 if (exec_shared_runtime_allocate(&rt_create
, u
->id
) < 0)
6766 if (streq(key
, "tmp-dir")) {
6767 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
6770 } else if (streq(key
, "var-tmp-dir")) {
6771 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
6774 } else if (streq(key
, "netns-socket-0")) {
6777 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
6778 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
6782 safe_close(rt
->netns_storage_socket
[0]);
6783 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
6785 } else if (streq(key
, "netns-socket-1")) {
6788 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
6789 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
6793 safe_close(rt
->netns_storage_socket
[1]);
6794 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
6799 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6801 r
= hashmap_put(u
->manager
->exec_shared_runtime_by_id
, rt_create
->id
, rt_create
);
6803 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
6807 rt_create
->manager
= u
->manager
;
6810 TAKE_PTR(rt_create
);
6816 int exec_shared_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
6817 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6819 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
6820 const char *p
, *v
= ASSERT_PTR(value
);
6826 n
= strcspn(v
, " ");
6827 id
= strndupa_safe(v
, n
);
6832 v
= startswith(p
, "tmp-dir=");
6834 n
= strcspn(v
, " ");
6835 tmp_dir
= strndup(v
, n
);
6843 v
= startswith(p
, "var-tmp-dir=");
6845 n
= strcspn(v
, " ");
6846 var_tmp_dir
= strndup(v
, n
);
6854 v
= startswith(p
, "netns-socket-0=");
6858 n
= strcspn(v
, " ");
6859 buf
= strndupa_safe(v
, n
);
6861 netns_fdpair
[0] = parse_fd(buf
);
6862 if (netns_fdpair
[0] < 0)
6863 return log_debug_errno(netns_fdpair
[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
6864 if (!fdset_contains(fds
, netns_fdpair
[0]))
6865 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6866 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
6867 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
6873 v
= startswith(p
, "netns-socket-1=");
6877 n
= strcspn(v
, " ");
6878 buf
= strndupa_safe(v
, n
);
6880 netns_fdpair
[1] = parse_fd(buf
);
6881 if (netns_fdpair
[1] < 0)
6882 return log_debug_errno(netns_fdpair
[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
6883 if (!fdset_contains(fds
, netns_fdpair
[1]))
6884 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6885 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
6886 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
6892 v
= startswith(p
, "ipcns-socket-0=");
6896 n
= strcspn(v
, " ");
6897 buf
= strndupa_safe(v
, n
);
6899 ipcns_fdpair
[0] = parse_fd(buf
);
6900 if (ipcns_fdpair
[0] < 0)
6901 return log_debug_errno(ipcns_fdpair
[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
6902 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
6903 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6904 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
6905 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
6911 v
= startswith(p
, "ipcns-socket-1=");
6915 n
= strcspn(v
, " ");
6916 buf
= strndupa_safe(v
, n
);
6918 ipcns_fdpair
[1] = parse_fd(buf
);
6919 if (ipcns_fdpair
[1] < 0)
6920 return log_debug_errno(ipcns_fdpair
[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
6921 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
6922 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6923 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
6924 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
6928 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
6930 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
6934 void exec_shared_runtime_vacuum(Manager
*m
) {
6935 ExecSharedRuntime
*rt
;
6939 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6941 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
6945 (void) exec_shared_runtime_free(rt
);
6949 int exec_runtime_make(
6951 const ExecContext
*context
,
6952 ExecSharedRuntime
*shared
,
6953 DynamicCreds
*creds
,
6954 ExecRuntime
**ret
) {
6955 _cleanup_close_pair_
int ephemeral_storage_socket
[2] = PIPE_EBADF
;
6956 _cleanup_free_
char *ephemeral
= NULL
;
6957 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
6964 if (!shared
&& !creds
&& !exec_needs_ephemeral(context
)) {
6969 if (exec_needs_ephemeral(context
)) {
6970 r
= mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
6974 r
= tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit
->id
, &ephemeral
);
6978 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ephemeral_storage_socket
) < 0)
6982 rt
= new(ExecRuntime
, 1);
6986 *rt
= (ExecRuntime
) {
6988 .dynamic_creds
= creds
,
6989 .ephemeral_copy
= TAKE_PTR(ephemeral
),
6990 .ephemeral_storage_socket
[0] = TAKE_FD(ephemeral_storage_socket
[0]),
6991 .ephemeral_storage_socket
[1] = TAKE_FD(ephemeral_storage_socket
[1]),
6994 *ret
= TAKE_PTR(rt
);
6998 ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
) {
7002 exec_shared_runtime_unref(rt
->shared
);
7003 dynamic_creds_unref(rt
->dynamic_creds
);
7005 rt
->ephemeral_copy
= destroy_tree(rt
->ephemeral_copy
);
7007 safe_close_pair(rt
->ephemeral_storage_socket
);
7011 ExecRuntime
* exec_runtime_destroy(ExecRuntime
*rt
) {
7015 rt
->shared
= exec_shared_runtime_destroy(rt
->shared
);
7016 rt
->dynamic_creds
= dynamic_creds_destroy(rt
->dynamic_creds
);
7017 return exec_runtime_free(rt
);
7020 void exec_params_clear(ExecParameters
*p
) {
7024 p
->environment
= strv_free(p
->environment
);
7025 p
->fd_names
= strv_free(p
->fd_names
);
7026 p
->fds
= mfree(p
->fds
);
7027 p
->exec_fd
= safe_close(p
->exec_fd
);
7030 void exec_directory_done(ExecDirectory
*d
) {
7034 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7035 free(d
->items
[i
].path
);
7036 strv_free(d
->items
[i
].symlinks
);
7039 d
->items
= mfree(d
->items
);
7044 static ExecDirectoryItem
*exec_directory_find(ExecDirectory
*d
, const char *path
) {
7048 for (size_t i
= 0; i
< d
->n_items
; i
++)
7049 if (path_equal(d
->items
[i
].path
, path
))
7050 return &d
->items
[i
];
/* Registers 'path' (optionally with a 'symlink') in an ExecDirectory.
 *
 * Visible behavior: the directory is first searched for an existing item with the same path via
 * exec_directory_find(). On the "existing" path the symlink is appended to that item's symlink list
 * with strv_extend() and 0 is returned. Otherwise a new symlink vector is created with strv_new(),
 * the item array is grown by one with GREEDY_REALLOC(), a new item taking ownership of 'p' and 's'
 * is appended, and 1 is returned.
 *
 * NOTE(review): many lines are missing from this view (embedded numbers skip 7059-7063, 7065,
 * 7067-7069, 7071-7077, 7079-7082, 7084-7085, 7089-7090). The declaration of 'r', the branch on
 * 'existing', the assignment of 'p' and all error returns are therefore not visible — confirm
 * against the full file before relying on this description. */
7055 int exec_directory_add(ExecDirectory
*d
, const char *path
, const char *symlink
) {
/* 's' and 'p' carry cleanup attributes, so they are released automatically unless ownership is
 * moved out with TAKE_PTR() below. */
7056 _cleanup_strv_free_
char **s
= NULL
;
7057 _cleanup_free_
char *p
= NULL
;
7058 ExecDirectoryItem
*existing
;
7064 existing
= exec_directory_find(d
, path
);
7066 r
= strv_extend(&existing
->symlinks
, symlink
);
7070 return 0; /* existing item is updated */
7078 s
= strv_new(symlink
);
7083 if (!GREEDY_REALLOC(d
->items
, d
->n_items
+ 1))
/* Ownership of both allocations moves into the newly appended item. */
7086 d
->items
[d
->n_items
++] = (ExecDirectoryItem
) {
7087 .path
= TAKE_PTR(p
),
7088 .symlinks
= TAKE_PTR(s
),
7091 return 1; /* new item is added */
/* Ordering callback for ExecDirectoryItem arrays (used by exec_directory_sort()): two items are
 * ordered by comparing their 'path' fields with path_compare().
 *
 * NOTE(review): embedded line numbers skip 7095-7097; presumably argument assertions — confirm. */
7094 static int exec_directory_item_compare_func(const ExecDirectoryItem
*a
, const ExecDirectoryItem
*b
) {
7098 return path_compare(a
->path
, b
->path
);
/* Sorts d->items by path using exec_directory_item_compare_func() and then marks every item whose
 * path has an earlier item's path as prefix (per path_startswith()) with only_create = true.
 * Directories with at most one item are special-cased up front.
 *
 * NOTE(review): embedded line numbers skip 7102-7103, 7110-7111 and everything after 7117, so the
 * body of the n_items <= 1 branch (presumably an early return) and the tail of the inner loop
 * (presumably a break) are not visible in this view — confirm. */
7101 void exec_directory_sort(ExecDirectory
*d
) {
7104 /* Sort the exec directories to make always parent directories processed at first in
7105 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7106 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7107 * list. See also comments in setup_exec_directory() and issue #24783. */
7109 if (d
->n_items
<= 1)
7112 typesafe_qsort(d
->items
, d
->n_items
, exec_directory_item_compare_func
);
/* After sorting, any potential parent of item i sits at a lower index, hence j < i. */
7114 for (size_t i
= 1; i
< d
->n_items
; i
++)
7115 for (size_t j
= 0; j
< i
; j
++)
7116 if (path_startswith(d
->items
[i
].path
, d
->items
[j
].path
)) {
7117 d
->items
[i
].only_create
= true;
/* Translates a textual clean-mask name into an ExecCleanMask value. The literal strings "all" and
 * "fdstore" map to EXEC_CLEAN_ALL and EXEC_CLEAN_FDSTORE respectively; any other string is looked
 * up with exec_resource_type_from_string().
 *
 * NOTE(review): embedded line numbers skip 7124-7126, 7133 and everything after 7134. The cast
 * return below is therefore very likely conditional (e.g. the error path of the lookup), and the
 * success path that turns the directory type into a mask bit is not visible here — confirm against
 * the full file. */
7122 ExecCleanMask
exec_clean_mask_from_string(const char *s
) {
7123 ExecDirectoryType t
;
7127 if (streq(s
, "all"))
7128 return EXEC_CLEAN_ALL
;
7129 if (streq(s
, "fdstore"))
7130 return EXEC_CLEAN_FDSTORE
;
7132 t
= exec_resource_type_from_string(s
);
7134 return (ExecCleanMask
) t
;
7139 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
7140 [EXEC_INPUT_NULL
] = "null",
7141 [EXEC_INPUT_TTY
] = "tty",
7142 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
7143 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
7144 [EXEC_INPUT_SOCKET
] = "socket",
7145 [EXEC_INPUT_NAMED_FD
] = "fd",
7146 [EXEC_INPUT_DATA
] = "data",
7147 [EXEC_INPUT_FILE
] = "file",
7150 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
7152 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
7153 [EXEC_OUTPUT_INHERIT
] = "inherit",
7154 [EXEC_OUTPUT_NULL
] = "null",
7155 [EXEC_OUTPUT_TTY
] = "tty",
7156 [EXEC_OUTPUT_KMSG
] = "kmsg",
7157 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
7158 [EXEC_OUTPUT_JOURNAL
] = "journal",
7159 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
7160 [EXEC_OUTPUT_SOCKET
] = "socket",
7161 [EXEC_OUTPUT_NAMED_FD
] = "fd",
7162 [EXEC_OUTPUT_FILE
] = "file",
7163 [EXEC_OUTPUT_FILE_APPEND
] = "append",
7164 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
7167 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
7169 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
7170 [EXEC_UTMP_INIT
] = "init",
7171 [EXEC_UTMP_LOGIN
] = "login",
7172 [EXEC_UTMP_USER
] = "user",
7175 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
7177 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
7178 [EXEC_PRESERVE_NO
] = "no",
7179 [EXEC_PRESERVE_YES
] = "yes",
7180 [EXEC_PRESERVE_RESTART
] = "restart",
7183 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
7185 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7186 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7187 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
7188 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
7189 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
7190 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
7191 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
7194 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
7196 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7197 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7198 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
7199 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
7200 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
7201 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
7202 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
7205 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
7207 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7208 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7209 * directories, specifically .timer units with their timestamp touch file. */
7210 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7211 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
7212 [EXEC_DIRECTORY_STATE
] = "state",
7213 [EXEC_DIRECTORY_CACHE
] = "cache",
7214 [EXEC_DIRECTORY_LOGS
] = "logs",
7215 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
7218 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
7220 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7221 * the service payload in. */
7222 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7223 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
7224 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
7225 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
7226 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
7227 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
7230 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
7232 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
7233 [EXEC_KEYRING_INHERIT
] = "inherit",
7234 [EXEC_KEYRING_PRIVATE
] = "private",
7235 [EXEC_KEYRING_SHARED
] = "shared",
7238 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);