1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
13 #include <sys/types.h>
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
21 #include <security/pam_appl.h>
25 #include <selinux/selinux.h>
29 #include <sys/apparmor.h>
32 #include "sd-messages.h"
35 #include "alloc-util.h"
37 #include "apparmor-util.h"
39 #include "argv-util.h"
43 #include "btrfs-util.h"
45 #include "capability-util.h"
46 #include "chattr-util.h"
47 #include "cgroup-setup.h"
49 #include "chown-recursive.h"
50 #include "constants.h"
51 #include "cpu-set-util.h"
52 #include "data-fd-util.h"
55 #include "errno-list.h"
57 #include "exec-credential.h"
59 #include "exit-status.h"
61 #include "format-util.h"
62 #include "glob-util.h"
63 #include "hexdecoct.h"
65 #include "ioprio-util.h"
66 #include "lock-util.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "missing_prctl.h"
75 #include "mkdir-label.h"
76 #include "namespace.h"
77 #include "parse-util.h"
78 #include "path-util.h"
79 #include "proc-cmdline.h"
80 #include "process-util.h"
82 #include "rlimit-util.h"
84 #include "seccomp-util.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "sort-util.h"
92 #include "stat-util.h"
93 #include "string-table.h"
94 #include "string-util.h"
96 #include "syslog-util.h"
97 #include "terminal-util.h"
98 #include "tmpfile-util.h"
99 #include "umask-util.h"
100 #include "unit-serialize.h"
101 #include "user-util.h"
102 #include "utmp-wtmp.h"
104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
107 #define SNDBUF_SIZE (8*1024*1024)
109 static int shift_fds(int fds
[], size_t n_fds
) {
113 /* Modifies the fds array! (sorts it) */
117 for (int start
= 0;;) {
118 int restart_from
= -1;
120 for (int i
= start
; i
< (int) n_fds
; i
++) {
123 /* Already at right index? */
127 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
134 /* Hmm, the fd we wanted isn't free? Then
135 * let's remember that and try again from here */
136 if (nfd
!= i
+3 && restart_from
< 0)
140 if (restart_from
< 0)
143 start
= restart_from
;
149 static int flags_fds(
162 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
163 * O_NONBLOCK only applies to socket activation though. */
165 for (size_t i
= 0; i
< n_fds
; i
++) {
167 if (i
< n_socket_fds
) {
168 r
= fd_nonblock(fds
[i
], nonblock
);
173 /* We unconditionally drop FD_CLOEXEC from the fds,
174 * since after all we want to pass these fds to our
177 r
= fd_cloexec(fds
[i
], false);
185 static const char *exec_context_tty_path(const ExecContext
*context
) {
188 if (context
->stdio_as_fds
)
191 if (context
->tty_path
)
192 return context
->tty_path
;
194 return "/dev/console";
197 static int exec_context_tty_size(const ExecContext
*context
, unsigned *ret_rows
, unsigned *ret_cols
) {
205 rows
= context
->tty_rows
;
206 cols
= context
->tty_cols
;
208 tty
= exec_context_tty_path(context
);
210 (void) proc_cmdline_tty_size(tty
, rows
== UINT_MAX
? &rows
: NULL
, cols
== UINT_MAX
? &cols
: NULL
);
218 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
219 _cleanup_close_
int fd
= -EBADF
;
220 const char *path
= exec_context_tty_path(ASSERT_PTR(context
));
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
227 if (p
&& p
->stdin_fd
>= 0) {
228 fd
= xopenat_lock(p
->stdin_fd
, NULL
,
229 O_RDONLY
|O_CLOEXEC
|O_NONBLOCK
|O_NOCTTY
, 0, 0, LOCK_BSD
, LOCK_EX
);
233 fd
= open_terminal(path
, O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NONBLOCK
);
237 if (lock_generic(fd
, LOCK_BSD
, LOCK_EX
) < 0)
240 return; /* nothing to do */
242 if (context
->tty_vhangup
)
243 (void) terminal_vhangup_fd(fd
);
245 if (context
->tty_reset
)
246 (void) reset_terminal_fd(fd
, true);
248 if (p
&& p
->stdin_fd
>= 0) {
249 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
251 (void) exec_context_tty_size(context
, &rows
, &cols
);
252 (void) terminal_set_size_fd(p
->stdin_fd
, path
, rows
, cols
);
255 if (context
->tty_vt_disallocate
&& path
)
256 (void) vt_disallocate(path
);
259 static bool is_terminal_input(ExecInput i
) {
262 EXEC_INPUT_TTY_FORCE
,
263 EXEC_INPUT_TTY_FAIL
);
266 static bool is_terminal_output(ExecOutput o
) {
269 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
273 static bool is_kmsg_output(ExecOutput o
) {
276 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
279 static bool exec_context_needs_term(const ExecContext
*c
) {
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
284 if (is_terminal_input(c
->std_input
))
287 if (is_terminal_output(c
->std_output
))
290 if (is_terminal_output(c
->std_error
))
293 return !!c
->tty_path
;
296 static int open_null_as(int flags
, int nfd
) {
301 fd
= open("/dev/null", flags
|O_NOCTTY
);
305 return move_fd(fd
, nfd
, false);
308 static int connect_journal_socket(
310 const char *log_namespace
,
314 uid_t olduid
= UID_INVALID
;
315 gid_t oldgid
= GID_INVALID
;
320 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
321 "/run/systemd/journal/stdout";
323 if (gid_is_valid(gid
)) {
326 if (setegid(gid
) < 0)
330 if (uid_is_valid(uid
)) {
333 if (seteuid(uid
) < 0) {
339 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
344 if (uid_is_valid(uid
))
345 (void) seteuid(olduid
);
348 if (gid_is_valid(gid
))
349 (void) setegid(oldgid
);
354 static int connect_logger_as(
356 const ExecContext
*context
,
357 const ExecParameters
*params
,
364 _cleanup_close_
int fd
= -EBADF
;
369 assert(output
< _EXEC_OUTPUT_MAX
);
373 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
377 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
381 if (shutdown(fd
, SHUT_RD
) < 0)
384 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
394 context
->syslog_identifier
?: ident
,
395 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
396 context
->syslog_priority
,
397 !!context
->syslog_level_prefix
,
399 is_kmsg_output(output
),
400 is_terminal_output(output
)) < 0)
403 return move_fd(TAKE_FD(fd
), nfd
, false);
406 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
412 fd
= open_terminal(path
, flags
| O_NOCTTY
);
416 return move_fd(fd
, nfd
, false);
419 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
420 _cleanup_close_
int fd
= -EBADF
;
425 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
428 fd
= open(path
, flags
|O_NOCTTY
, mode
);
432 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
437 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
441 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
442 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
449 if ((flags
& O_ACCMODE
) == O_RDONLY
)
450 r
= shutdown(fd
, SHUT_WR
);
451 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
452 r
= shutdown(fd
, SHUT_RD
);
461 static int fixup_input(
462 const ExecContext
*context
,
464 bool apply_tty_stdin
) {
470 std_input
= context
->std_input
;
472 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
473 return EXEC_INPUT_NULL
;
475 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
476 return EXEC_INPUT_NULL
;
478 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
479 return EXEC_INPUT_NULL
;
484 static int fixup_output(ExecOutput output
, int socket_fd
) {
486 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
487 return EXEC_OUTPUT_INHERIT
;
492 static int setup_input(
493 const ExecContext
*context
,
494 const ExecParameters
*params
,
496 const int named_iofds
[static 3]) {
505 if (params
->stdin_fd
>= 0) {
506 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
510 if (isatty(STDIN_FILENO
)) {
511 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
513 (void) exec_context_tty_size(context
, &rows
, &cols
);
514 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
515 (void) reset_terminal_fd(STDIN_FILENO
, true);
516 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, rows
, cols
);
522 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
526 case EXEC_INPUT_NULL
:
527 return open_null_as(O_RDONLY
, STDIN_FILENO
);
530 case EXEC_INPUT_TTY_FORCE
:
531 case EXEC_INPUT_TTY_FAIL
: {
535 fd
= acquire_terminal(exec_context_tty_path(context
),
536 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
537 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
538 ACQUIRE_TERMINAL_WAIT
,
543 r
= exec_context_tty_size(context
, &rows
, &cols
);
547 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), rows
, cols
);
551 return move_fd(fd
, STDIN_FILENO
, false);
554 case EXEC_INPUT_SOCKET
:
555 assert(socket_fd
>= 0);
557 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
559 case EXEC_INPUT_NAMED_FD
:
560 assert(named_iofds
[STDIN_FILENO
] >= 0);
562 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
563 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
565 case EXEC_INPUT_DATA
: {
568 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
572 return move_fd(fd
, STDIN_FILENO
, false);
575 case EXEC_INPUT_FILE
: {
579 assert(context
->stdio_file
[STDIN_FILENO
]);
581 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
582 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
584 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
588 return move_fd(fd
, STDIN_FILENO
, false);
592 assert_not_reached();
596 static bool can_inherit_stderr_from_stdout(
597 const ExecContext
*context
,
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
606 if (e
== EXEC_OUTPUT_INHERIT
)
611 if (e
== EXEC_OUTPUT_NAMED_FD
)
612 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
614 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
615 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
620 static int setup_output(
622 const ExecContext
*context
,
623 const ExecParameters
*params
,
626 const int named_iofds
[static 3],
630 dev_t
*journal_stream_dev
,
631 ino_t
*journal_stream_ino
) {
641 assert(journal_stream_dev
);
642 assert(journal_stream_ino
);
644 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
646 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
649 return STDOUT_FILENO
;
652 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
653 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
656 return STDERR_FILENO
;
659 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
660 o
= fixup_output(context
->std_output
, socket_fd
);
662 if (fileno
== STDERR_FILENO
) {
664 e
= fixup_output(context
->std_error
, socket_fd
);
666 /* This expects that the input and output are already set up */
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e
== EXEC_OUTPUT_INHERIT
&&
671 o
== EXEC_OUTPUT_INHERIT
&&
672 i
== EXEC_INPUT_NULL
&&
673 !is_terminal_input(context
->std_input
) &&
677 /* Duplicate from stdout if possible */
678 if (can_inherit_stderr_from_stdout(context
, o
, e
))
679 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
683 } else if (o
== EXEC_OUTPUT_INHERIT
) {
684 /* If input got downgraded, inherit the original value */
685 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
686 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
690 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY
, fileno
);
702 case EXEC_OUTPUT_NULL
:
703 return open_null_as(O_WRONLY
, fileno
);
705 case EXEC_OUTPUT_TTY
:
706 if (is_terminal_input(i
))
707 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
709 /* We don't reset the terminal if this is just about output */
710 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
712 case EXEC_OUTPUT_KMSG
:
713 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
714 case EXEC_OUTPUT_JOURNAL
:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
716 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
718 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
720 r
= open_null_as(O_WRONLY
, fileno
);
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
726 * services to detect whether they are connected to the journal or not.
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
731 if (fstat(fileno
, &st
) >= 0 &&
732 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
733 *journal_stream_dev
= st
.st_dev
;
734 *journal_stream_ino
= st
.st_ino
;
739 case EXEC_OUTPUT_SOCKET
:
740 assert(socket_fd
>= 0);
742 return RET_NERRNO(dup2(socket_fd
, fileno
));
744 case EXEC_OUTPUT_NAMED_FD
:
745 assert(named_iofds
[fileno
] >= 0);
747 (void) fd_nonblock(named_iofds
[fileno
], false);
748 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
750 case EXEC_OUTPUT_FILE
:
751 case EXEC_OUTPUT_FILE_APPEND
:
752 case EXEC_OUTPUT_FILE_TRUNCATE
: {
756 assert(context
->stdio_file
[fileno
]);
758 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
759 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
762 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
765 if (o
== EXEC_OUTPUT_FILE_APPEND
)
767 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
770 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
774 return move_fd(fd
, fileno
, 0);
778 assert_not_reached();
782 static int chown_terminal(int fd
, uid_t uid
) {
787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
788 if (isatty(fd
) < 1) {
789 if (IN_SET(errno
, EINVAL
, ENOTTY
))
790 return 0; /* not a tty */
795 /* This might fail. What matters are the results. */
796 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
803 static int setup_confirm_stdio(
804 const ExecContext
*context
,
806 int *ret_saved_stdin
,
807 int *ret_saved_stdout
) {
809 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
813 assert(ret_saved_stdin
);
814 assert(ret_saved_stdout
);
816 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
820 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
821 if (saved_stdout
< 0)
824 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
828 r
= chown_terminal(fd
, getuid());
832 r
= reset_terminal_fd(fd
, true);
836 r
= exec_context_tty_size(context
, &rows
, &cols
);
840 r
= terminal_set_size_fd(fd
, vc
, rows
, cols
);
844 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
849 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
850 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
854 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
857 if (err
== -ETIMEDOUT
)
858 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
861 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
865 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
866 _cleanup_close_
int fd
= -EBADF
;
870 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
874 write_confirm_error_fd(err
, fd
, u
);
877 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
881 assert(saved_stdout
);
885 if (*saved_stdin
>= 0)
886 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
889 if (*saved_stdout
>= 0)
890 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
893 *saved_stdin
= safe_close(*saved_stdin
);
894 *saved_stdout
= safe_close(*saved_stdout
);
900 CONFIRM_PRETEND_FAILURE
= -1,
901 CONFIRM_PRETEND_SUCCESS
= 0,
905 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
906 int saved_stdout
= -1, saved_stdin
= -1, r
;
907 _cleanup_free_
char *e
= NULL
;
910 /* For any internal errors, assume a positive response. */
911 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
913 write_confirm_error(r
, vc
, u
);
914 return CONFIRM_EXECUTE
;
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
923 e
= ellipsize(cmdline
, 60, 100);
931 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
933 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
945 unit_dump(u
, stdout
, " ");
946 continue; /* ask again */
948 printf("Failing execution.\n");
949 r
= CONFIRM_PRETEND_FAILURE
;
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
954 " f - fail, don't execute the command and pretend it failed\n"
956 " i - info, show a short summary of the unit\n"
957 " j - jobs, show jobs that are in progress\n"
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
960 continue; /* ask again */
962 printf(" Description: %s\n"
965 u
->id
, u
->description
, cmdline
);
966 continue; /* ask again */
968 manager_dump_jobs(u
->manager
, stdout
, /* patterns= */ NULL
, " ");
969 continue; /* ask again */
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
975 printf("Skipping execution.\n");
976 r
= CONFIRM_PRETEND_SUCCESS
;
982 assert_not_reached();
988 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
992 static int get_fixed_user(
993 const char *username
,
994 const char **ret_user
,
997 const char **ret_home
,
998 const char **ret_shell
) {
1005 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1006 * (i.e. are "/" or "/bin/nologin"). */
1008 r
= get_user_creds(&username
, ret_uid
, ret_gid
, ret_home
, ret_shell
, USER_CREDS_CLEAN
);
1012 *ret_user
= username
;
1016 static int get_fixed_group(
1017 const char *groupname
,
1018 const char **ret_group
,
1026 r
= get_group_creds(&groupname
, ret_gid
, /* flags = */ 0);
1030 *ret_group
= groupname
;
1034 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
1035 const char *group
, gid_t gid
,
1036 gid_t
**supplementary_gids
, int *ngids
) {
1039 bool keep_groups
= false;
1040 gid_t
*groups
= NULL
;
1041 _cleanup_free_ gid_t
*l_gids
= NULL
;
1046 * If user is given, then look up GID and supplementary groups list.
1047 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1048 * here and as early as possible so we keep the list of supplementary
1049 * groups of the caller.
1051 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1052 /* First step, initialize groups from /etc/group */
1053 if (initgroups(user
, gid
) < 0)
1059 if (strv_isempty(c
->supplementary_groups
))
1063 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1064 * be positive, otherwise fail.
1067 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1068 if (ngroups_max
<= 0)
1069 return errno_or_else(EOPNOTSUPP
);
1071 l_gids
= new(gid_t
, ngroups_max
);
1077 * Look up the list of groups that the user belongs to; we
1078 * avoid NSS lookups here too for gid=0.
1081 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1086 STRV_FOREACH(i
, c
->supplementary_groups
) {
1089 if (k
>= ngroups_max
)
1093 r
= get_group_creds(&g
, l_gids
+k
, 0);
1101 * Sets ngids to zero to drop all supplementary groups, happens
1102 * when we are under root and SupplementaryGroups= is empty.
1109 /* Otherwise get the final list of supplementary groups */
1110 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1114 *supplementary_gids
= groups
;
1122 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1125 /* Handle SupplementaryGroups= if it is not empty */
1127 r
= maybe_setgroups(ngids
, supplementary_gids
);
1132 if (gid_is_valid(gid
)) {
1133 /* Then set our gids */
1134 if (setresgid(gid
, gid
, gid
) < 0)
1141 static int set_securebits(unsigned bits
, unsigned mask
) {
1145 current
= prctl(PR_GET_SECUREBITS
);
1149 /* Clear all securebits defined in mask and set bits */
1150 applied
= ((unsigned) current
& ~mask
) | bits
;
1151 if ((unsigned) current
== applied
)
1154 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
1160 static int enforce_user(
1161 const ExecContext
*context
,
1163 uint64_t capability_ambient_set
) {
1167 if (!uid_is_valid(uid
))
1170 /* Sets (but doesn't look up) the UIDs and makes sure we keep the capabilities while doing so. For
1171 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1174 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1176 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1177 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1178 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1183 /* Second step: actually set the uids */
1184 if (setresuid(uid
, uid
, uid
) < 0)
1187 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1188 * the caps might have been corrupted by the setresuid(), so we need to clean them up later. This is done
1189 * outside of this call. */
1195 static int null_conv(
1197 const struct pam_message
**msg
,
1198 struct pam_response
**resp
,
1199 void *appdata_ptr
) {
1201 /* We don't support conversations */
1203 return PAM_CONV_ERR
;
1208 static int setup_pam(
1214 char ***env
, /* updated on success */
1215 const int fds
[], size_t n_fds
) {
1219 static const struct pam_conv conv
= {
1224 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1225 _cleanup_strv_free_
char **e
= NULL
;
1226 pam_handle_t
*handle
= NULL
;
1228 int pam_code
= PAM_SUCCESS
, r
;
1229 bool close_session
= false;
1230 pid_t pam_pid
= 0, parent_pid
;
1237 /* We set up PAM in the parent process, then fork. The child
1238 * will then stay around until killed via PR_SET_PDEATHSIG or
1239 * systemd via the cgroup logic. It will then remove the PAM
1240 * session again. The parent process will exec() the actual
1241 * daemon. We do things this way to ensure that the main PID
1242 * of the daemon is the one we initially fork()ed. */
1244 r
= barrier_create(&barrier
);
1248 if (log_get_max_level() < LOG_DEBUG
)
1249 flags
|= PAM_SILENT
;
1251 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1252 if (pam_code
!= PAM_SUCCESS
) {
1258 _cleanup_free_
char *q
= NULL
;
1260 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1261 * out if that's the case, and read the TTY off it. */
1263 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1264 tty
= strjoina("/dev/", q
);
1268 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1269 if (pam_code
!= PAM_SUCCESS
)
1273 STRV_FOREACH(nv
, *env
) {
1274 pam_code
= pam_putenv(handle
, *nv
);
1275 if (pam_code
!= PAM_SUCCESS
)
1279 pam_code
= pam_acct_mgmt(handle
, flags
);
1280 if (pam_code
!= PAM_SUCCESS
)
1283 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1284 if (pam_code
!= PAM_SUCCESS
)
1285 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1287 pam_code
= pam_open_session(handle
, flags
);
1288 if (pam_code
!= PAM_SUCCESS
)
1291 close_session
= true;
1293 e
= pam_getenvlist(handle
);
1295 pam_code
= PAM_BUF_ERR
;
1299 /* Block SIGTERM, so that we know that it won't get lost in the child */
1301 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1303 parent_pid
= getpid_cached();
1305 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1309 int sig
, ret
= EXIT_PAM
;
1311 /* The child's job is to reset the PAM session on termination */
1312 barrier_set_role(&barrier
, BARRIER_CHILD
);
1314 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1315 * those fds are open here that have been opened by PAM. */
1316 (void) close_many(fds
, n_fds
);
1318 /* Drop privileges - we don't need any to pam_close_session and this will make
1319 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1320 * threads to fail to exit normally */
1322 r
= maybe_setgroups(0, NULL
);
1324 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1325 if (setresgid(gid
, gid
, gid
) < 0)
1326 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1327 if (setresuid(uid
, uid
, uid
) < 0)
1328 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1330 (void) ignore_signals(SIGPIPE
);
1332 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1333 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1334 * this way. We rely on the control groups kill logic to do the rest for us. */
1335 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1338 /* Tell the parent that our setup is done. This is especially important regarding dropping
1339 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1341 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1342 (void) barrier_place(&barrier
);
1344 /* Check if our parent process might already have died? */
1345 if (getppid() == parent_pid
) {
1348 assert_se(sigemptyset(&ss
) >= 0);
1349 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1352 if (sigwait(&ss
, &sig
) < 0) {
1359 assert(sig
== SIGTERM
);
1364 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1365 if (pam_code
!= PAM_SUCCESS
)
1368 /* If our parent died we'll end the session */
1369 if (getppid() != parent_pid
) {
1370 pam_code
= pam_close_session(handle
, flags
);
1371 if (pam_code
!= PAM_SUCCESS
)
1378 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1379 * know about this. See pam_end(3) */
1380 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1384 barrier_set_role(&barrier
, BARRIER_PARENT
);
1386 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1390 /* Unblock SIGTERM again in the parent */
1391 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1393 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1394 * this fd around. */
1397 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1398 * recover. However, warn loudly if it happens. */
1399 if (!barrier_place_and_sync(&barrier
))
1400 log_error("PAM initialization failed");
1402 return strv_free_and_replace(*env
, e
);
1405 if (pam_code
!= PAM_SUCCESS
) {
1406 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1407 r
= -EPERM
; /* PAM errors do not map to errno */
1409 log_error_errno(r
, "PAM failed: %m");
1413 pam_code
= pam_close_session(handle
, flags
);
1415 (void) pam_end(handle
, pam_code
| flags
);
1425 static void rename_process_from_path(const char *path
) {
1426 _cleanup_free_
char *buf
= NULL
;
1431 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1434 if (path_extract_filename(path
, &buf
) < 0) {
1435 rename_process("(...)");
1439 size_t l
= strlen(buf
);
1441 /* The end of the process name is usually more interesting, since the first bit might just be
1448 char process_name
[11];
1449 process_name
[0] = '(';
1450 memcpy(process_name
+1, p
, l
);
1451 process_name
[1+l
] = ')';
1452 process_name
[1+l
+1] = 0;
1454 rename_process(process_name
);
1457 static bool context_has_address_families(const ExecContext
*c
) {
1460 return c
->address_families_allow_list
||
1461 !set_isempty(c
->address_families
);
1464 static bool context_has_syscall_filters(const ExecContext
*c
) {
1467 return c
->syscall_allow_list
||
1468 !hashmap_isempty(c
->syscall_filter
);
1471 static bool context_has_syscall_logs(const ExecContext
*c
) {
1474 return c
->syscall_log_allow_list
||
1475 !hashmap_isempty(c
->syscall_log
);
1478 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1481 if (c
->no_new_privileges
)
1484 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1487 /* We need NNP if we have any form of seccomp and are unprivileged */
1488 return c
->lock_personality
||
1489 c
->memory_deny_write_execute
||
1490 c
->private_devices
||
1492 c
->protect_hostname
||
1493 c
->protect_kernel_tunables
||
1494 c
->protect_kernel_modules
||
1495 c
->protect_kernel_logs
||
1496 context_has_address_families(c
) ||
1497 exec_context_restrict_namespaces_set(c
) ||
1498 c
->restrict_realtime
||
1499 c
->restrict_suid_sgid
||
1500 !set_isempty(c
->syscall_archs
) ||
1501 context_has_syscall_filters(c
) ||
1502 context_has_syscall_logs(c
);
1507 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1509 if (is_seccomp_available())
1512 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1516 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1517 uint32_t negative_action
, default_action
, action
;
1523 if (!context_has_syscall_filters(c
))
1526 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1529 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1531 if (c
->syscall_allow_list
) {
1532 default_action
= negative_action
;
1533 action
= SCMP_ACT_ALLOW
;
1535 default_action
= SCMP_ACT_ALLOW
;
1536 action
= negative_action
;
1539 if (needs_ambient_hack
) {
1540 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1545 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
/* Installs the seccomp filter for SystemCallLog=.
 *
 * Guarded by context_has_syscall_logs() and skip_seccomp_unavailable(). Depending on
 * c->syscall_log_allow_list either only the listed system calls are logged (default action
 * SCMP_ACT_ALLOW, listed action SCMP_ACT_LOG) or everything except the listed ones (default action
 * SCMP_ACT_LOG, listed action SCMP_ACT_ALLOW). The filter is loaded from c->syscall_log.
 *
 * The trailing debug message covers the case where SCMP_ACT_LOG is not available ("old libseccomp").
 * NOTE(review): the conditional compilation that selects between the two code paths is not visible
 * here; confirm where exactly it starts and ends. */
1548 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1550 uint32_t default_action
, action
;
1556 if (!context_has_syscall_logs(c
))
1560 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1563 if (c
->syscall_log_allow_list
) {
1564 /* Log nothing but the ones listed */
1565 default_action
= SCMP_ACT_ALLOW
;
1566 action
= SCMP_ACT_LOG
;
1568 /* Log everything but the ones listed */
1569 default_action
= SCMP_ACT_LOG
;
1570 action
= SCMP_ACT_ALLOW
;
1573 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1575 /* old libseccomp */
1576 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1581 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1585 if (set_isempty(c
->syscall_archs
))
1588 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1591 return seccomp_restrict_archs(c
->syscall_archs
);
1594 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1598 if (!context_has_address_families(c
))
1601 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1604 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
1607 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1613 if (!c
->memory_deny_write_execute
)
1616 /* use prctl() if kernel supports it (6.3) */
1617 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1619 log_unit_debug(u
, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1622 if (r
< 0 && errno
!= EINVAL
)
1623 return log_unit_debug_errno(u
, errno
, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1624 /* else use seccomp */
1625 log_unit_debug(u
, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1627 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1630 return seccomp_memory_deny_write_execute();
1633 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1637 if (!c
->restrict_realtime
)
1640 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1643 return seccomp_restrict_realtime();
1646 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1650 if (!c
->restrict_suid_sgid
)
1653 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1656 return seccomp_restrict_suid_sgid();
1659 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1663 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1664 * let's protect even those systems where this is left on in the kernel. */
1666 if (!c
->protect_kernel_tunables
)
1669 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1672 return seccomp_protect_sysctl();
1675 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1679 /* Turn off module syscalls on ProtectKernelModules=yes */
1681 if (!c
->protect_kernel_modules
)
1684 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1687 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1690 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1694 if (!c
->protect_kernel_logs
)
1697 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1700 return seccomp_protect_syslog();
1703 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1707 if (!c
->protect_clock
)
1710 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1713 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
1716 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1720 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1722 if (!c
->private_devices
)
1725 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1728 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1731 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1735 if (!exec_context_restrict_namespaces_set(c
))
1738 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1741 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1744 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1745 unsigned long personality
;
1751 if (!c
->lock_personality
)
1754 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1757 personality
= c
->personality
;
1759 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1760 if (personality
== PERSONALITY_INVALID
) {
1762 r
= opinionated_personality(&personality
);
1767 return seccomp_lock_personality(personality
);
1773 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1777 if (!exec_context_restrict_filesystems_set(c
))
1780 if (!u
->manager
->restrict_fs
) {
1781 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1782 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1786 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
/* Applies ProtectHostname=.
 *
 * If UTS namespaces are supported (ns_type_supported(NAMESPACE_UTS)), a new UTS namespace is unshared.
 * A failure of unshare() that is neither "not supported" nor a privilege error is fatal: it is logged,
 * and EXIT_NAMESPACE is stored in *ret_exit_status. The other failures, as well as missing kernel
 * support, only result in a warning.
 *
 * Afterwards the seccomp based hostname protection (seccomp_protect_hostname()) is applied, unless
 * seccomp is unavailable. If that fails, the error is logged and EXIT_SECCOMP is stored in
 * *ret_exit_status.
 *
 * NOTE(review): the seccomp part may be subject to a build-time guard that is not visible here. */
1790 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1794 if (!c
->protect_hostname
)
1797 if (ns_type_supported(NAMESPACE_UTS
)) {
1798 if (unshare(CLONE_NEWUTS
) < 0) {
1799 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1800 *ret_exit_status
= EXIT_NAMESPACE
;
1801 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1804 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1807 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1812 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1815 r
= seccomp_protect_hostname();
1817 *ret_exit_status
= EXIT_SECCOMP
;
1818 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
/* Performs the "idle pipe" handshake on the four file descriptors in idle_pipe[].
 *
 * idle_pipe[1] and idle_pipe[2] are closed right away. If idle_pipe[0] is valid, we wait on it for
 * POLLHUP for up to IDLE_TIMEOUT_USEC. If that wait times out and idle_pipe[3] is valid, a single byte
 * is written to idle_pipe[3] to signal that we want to continue, followed by a second, shorter wait on
 * idle_pipe[0] (IDLE_TIMEOUT2_USEC) whose result is ignored. Finally idle_pipe[0] and idle_pipe[3] are
 * closed as well. Each closed slot is overwritten with the return value of safe_close(). */
1825 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1828 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1829 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1831 if (idle_pipe
[0] >= 0) {
1834 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1836 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1839 /* Signal systemd that we are bored and want to continue. */
1840 n
= write(idle_pipe
[3], "x", 1);
1842 /* Wait for systemd to react to the signal above. */
1843 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1846 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1850 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1853 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
/* Builds the set of environment variables that we generate ourselves for a process about to be
 * executed, and returns it as a newly allocated string vector in *ret.
 *
 * The vector is allocated once with room for N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX pointers; every
 * variable appended below has to be accounted for in N_ENV_VARS (see the assert() at the end). Variables
 * generated here are: $LISTEN_PID, $LISTEN_FDS, $LISTEN_FDNAMES, $WATCHDOG_PID, $WATCHDOG_USEC,
 * $SYSTEMD_NSS_DYNAMIC_BYPASS, $USER, $LOGNAME, $HOME, $SHELL, $INVOCATION_ID, $TERM, $JOURNAL_STREAM,
 * $LOG_NAMESPACE, one variable per configured directory type (name taken from
 * exec_directory_env_name_to_string(), value a colon separated list of prefixed paths),
 * $CREDENTIALS_DIRECTORY, $SYSTEMD_EXEC_PID, $MEMORY_PRESSURE_WATCH and $MEMORY_PRESSURE_WRITE. Most of
 * them are only set if the corresponding setting or parameter is present. */
1855 static int build_environment(
1857 const ExecContext
*c
,
1858 const ExecParameters
*p
,
1859 const CGroupContext
*cgroup_context
,
1863 const char *username
,
1865 dev_t journal_stream_dev
,
1866 ino_t journal_stream_ino
,
1867 const char *memory_pressure_path
,
1870 _cleanup_strv_free_
char **our_env
= NULL
;
1880 #define N_ENV_VARS 19
1881 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1886 _cleanup_free_
char *joined
= NULL
;
1888 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1890 our_env
[n_env
++] = x
;
1892 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1894 our_env
[n_env
++] = x
;
1896 joined
= strv_join(fdnames
, ":");
1900 x
= strjoin("LISTEN_FDNAMES=", joined
);
1903 our_env
[n_env
++] = x
;
1906 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1907 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1909 our_env
[n_env
++] = x
;
1911 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1913 our_env
[n_env
++] = x
;
1916 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1917 * Varlink calls back to us for looking up dynamic users in PID 1. Break the deadlock between D-Bus and
1918 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
1919 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1920 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1923 our_env
[n_env
++] = x
;
1926 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1927 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1928 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1929 * SetLoginEnvironment= switch. */
1930 if (!c
->user
&& !c
->dynamic_user
&& p
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
1931 r
= get_fixed_user("root", &username
, NULL
, NULL
, &home
, &shell
);
1933 return log_unit_error_errno(u
, r
, "Failed to determine user credentials for root: %m");
1936 bool set_user_login_env
= c
->set_login_environment
>= 0 ? c
->set_login_environment
: (c
->user
|| c
->dynamic_user
);
1939 x
= strjoin("USER=", username
);
1942 our_env
[n_env
++] = x
;
1944 if (set_user_login_env
) {
1945 x
= strjoin("LOGNAME=", username
);
1948 our_env
[n_env
++] = x
;
1952 if (home
&& set_user_login_env
) {
1953 x
= strjoin("HOME=", home
);
1957 path_simplify(x
+ 5);
1958 our_env
[n_env
++] = x
;
1961 if (shell
&& set_user_login_env
) {
1962 x
= strjoin("SHELL=", shell
);
1966 path_simplify(x
+ 6);
1967 our_env
[n_env
++] = x
;
1970 if (!sd_id128_is_null(u
->invocation_id
)) {
1971 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1974 our_env
[n_env
++] = x
;
1977 if (exec_context_needs_term(c
)) {
1978 _cleanup_free_
char *cmdline
= NULL
;
1979 const char *tty_path
, *term
= NULL
;
1981 tty_path
= exec_context_tty_path(c
);
1983 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1984 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1985 * container manager passes to PID 1 ends up all the way in the console login shown. */
1987 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
1988 term
= getenv("TERM");
1989 else if (tty_path
&& in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
1990 _cleanup_free_
char *key
= NULL
;
1992 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
1996 r
= proc_cmdline_get_key(key
, 0, &cmdline
);
1998 log_debug_errno(r
, "Failed to read %s from kernel cmdline, ignoring: %m", key
);
2004 term
= default_term_for_tty(tty_path
);
2006 x
= strjoin("TERM=", term
);
2009 our_env
[n_env
++] = x
;
2012 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
2013 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
2016 our_env
[n_env
++] = x
;
2019 if (c
->log_namespace
) {
2020 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
2024 our_env
[n_env
++] = x
;
2027 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2028 _cleanup_free_
char *joined
= NULL
;
2034 if (c
->directories
[t
].n_items
== 0)
2037 n
= exec_directory_env_name_to_string(t
);
2041 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
2042 _cleanup_free_
char *prefixed
= NULL
;
2044 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
2048 if (!strextend_with_separator(&joined
, ":", prefixed
))
2052 x
= strjoin(n
, "=", joined
);
2056 our_env
[n_env
++] = x
;
2059 _cleanup_free_
char *creds_dir
= NULL
;
2060 r
= exec_context_get_credential_directory(c
, p
, u
->id
, &creds_dir
);
2064 x
= strjoin("CREDENTIALS_DIRECTORY=", creds_dir
);
2068 our_env
[n_env
++] = x
;
2071 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2074 our_env
[n_env
++] = x
;
2076 if (memory_pressure_path
) {
2077 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2081 our_env
[n_env
++] = x
;
2083 if (cgroup_context
&& !path_equal(memory_pressure_path
, "/dev/null")) {
2084 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
2086 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2087 MEMORY_PRESSURE_DEFAULT_TYPE
,
2088 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2089 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2090 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
2093 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2096 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2100 our_env
[n_env
++] = x
;
2104 assert(n_env
< N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2107 *ret
= TAKE_PTR(our_env
);
/* Builds the string vector of "NAME=value" assignments for the variable names listed in
 * c->pass_environment, and returns it in *ret.
 *
 * The vector is grown with GREEDY_REALLOC() by one entry per variable and kept NULL terminated after
 * every insertion, so that the cleanup handler can free it at any point.
 *
 * NOTE(review): where the value 'v' comes from is not visible here; presumably it is looked up in
 * our own environment, confirm. */
2112 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2113 _cleanup_strv_free_
char **pass_env
= NULL
;
2116 STRV_FOREACH(i
, c
->pass_environment
) {
2117 _cleanup_free_
char *x
= NULL
;
2123 x
= strjoin(*i
, "=", v
);
2127 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2130 pass_env
[n_env
++] = TAKE_PTR(x
);
2131 pass_env
[n_env
] = NULL
;
2134 *ret
= TAKE_PTR(pass_env
);
2139 bool exec_needs_network_namespace(const ExecContext
*context
) {
2142 return context
->private_network
|| context
->network_namespace_path
;
2145 static bool exec_needs_ephemeral(const ExecContext
*context
) {
2146 return (context
->root_image
|| context
->root_directory
) && context
->root_ephemeral
;
2149 static bool exec_needs_ipc_namespace(const ExecContext
*context
) {
2152 return context
->private_ipc
|| context
->ipc_namespace_path
;
2155 bool exec_needs_mount_namespace(
2156 const ExecContext
*context
,
2157 const ExecParameters
*params
,
2158 const ExecRuntime
*runtime
) {
2162 if (context
->root_image
)
2165 if (!strv_isempty(context
->read_write_paths
) ||
2166 !strv_isempty(context
->read_only_paths
) ||
2167 !strv_isempty(context
->inaccessible_paths
) ||
2168 !strv_isempty(context
->exec_paths
) ||
2169 !strv_isempty(context
->no_exec_paths
))
2172 if (context
->n_bind_mounts
> 0)
2175 if (context
->n_temporary_filesystems
> 0)
2178 if (context
->n_mount_images
> 0)
2181 if (context
->n_extension_images
> 0)
2184 if (!strv_isempty(context
->extension_directories
))
2187 if (!IN_SET(context
->mount_propagation_flag
, 0, MS_SHARED
))
2190 if (context
->private_tmp
&& runtime
&& runtime
->shared
&& (runtime
->shared
->tmp_dir
|| runtime
->shared
->var_tmp_dir
))
2193 if (context
->private_devices
||
2194 context
->private_mounts
> 0 ||
2195 (context
->private_mounts
< 0 && exec_needs_network_namespace(context
)) ||
2196 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2197 context
->protect_home
!= PROTECT_HOME_NO
||
2198 context
->protect_kernel_tunables
||
2199 context
->protect_kernel_modules
||
2200 context
->protect_kernel_logs
||
2201 context
->protect_control_groups
||
2202 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2203 context
->proc_subset
!= PROC_SUBSET_ALL
||
2204 exec_needs_ipc_namespace(context
))
2207 if (context
->root_directory
) {
2208 if (exec_context_get_effective_mount_apivfs(context
))
2211 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2212 if (params
&& !params
->prefix
[t
])
2215 if (context
->directories
[t
].n_items
> 0)
2220 if (context
->dynamic_user
&&
2221 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2222 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2223 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2226 if (context
->log_namespace
)
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has its
 * setgroups, gid_map and uid_map files written by a forked helper process named "(sd-userns)".
 *
 * 'ouid'/'ogid' are the original IDs, 'uid'/'gid' the selected ones; see the comment below for which of
 * them end up in the mappings. Two channels are used between the two processes: an eventfd
 * (unshare_ready_fd), written by the parent once the namespace exists, and a pipe (errno_pipe) through
 * which the helper reports an error code. The parent treats a full-sized read from the pipe as an
 * error report, expects 0 bytes on success, and finally checks the helper's exit status. */
2232 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2233 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2234 _cleanup_close_pair_
int errno_pipe
[2] = PIPE_EBADF
;
2235 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2236 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2241 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2242 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2243 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2244 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2245 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2246 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2247 * continues execution normally.
2248 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2249 * does not need CAP_SETUID to write the single line mapping to itself. */
2251 /* Can only set up multiple mappings with CAP_SETUID. */
2252 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2253 r
= asprintf(&uid_map
,
2254 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2255 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2256 ouid
, ouid
, uid
, uid
);
2258 r
= asprintf(&uid_map
,
2259 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2265 /* Can only set up multiple mappings with CAP_SETGID. */
2266 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2267 r
= asprintf(&gid_map
,
2268 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2269 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2270 ogid
, ogid
, gid
, gid
);
2272 r
= asprintf(&gid_map
,
2273 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2279 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2281 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2282 if (unshare_ready_fd
< 0)
2285 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2287 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2290 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2294 _cleanup_close_
int fd
= -EBADF
;
2298 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2299 * here, after the parent opened its own user namespace. */
2302 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2304 /* Wait until the parent unshared the user namespace */
2305 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2310 /* Disable the setgroups() system call in the child user namespace, for good. */
2311 a
= procfs_file_alloca(ppid
, "setgroups");
2312 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2314 if (errno
!= ENOENT
) {
2319 /* If the file is missing the kernel is too old, let's continue anyway. */
2321 if (write(fd
, "deny\n", 5) < 0) {
2326 fd
= safe_close(fd
);
2329 /* First write the GID map */
2330 a
= procfs_file_alloca(ppid
, "gid_map");
2331 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2336 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2340 fd
= safe_close(fd
);
2342 /* Then write the UID map */
2343 a
= procfs_file_alloca(ppid
, "uid_map");
2344 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2349 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2354 _exit(EXIT_SUCCESS
);
2357 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2358 _exit(EXIT_FAILURE
);
2361 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2363 if (unshare(CLONE_NEWUSER
) < 0)
2366 /* Let the child know that the namespace is ready now */
2367 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2370 /* Try to read an error code from the child */
2371 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2374 if (n
== sizeof(r
)) { /* an error code was sent to us */
2379 if (n
!= 0) /* on success we should have read 0 bytes */
2382 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2385 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2391 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2394 if (!context
->dynamic_user
)
2397 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2400 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
/* Creates one symlink per entry of 'symlinks', each of them referring to 'source'.
 *
 * Both 'source' and every entry of 'symlinks' are joined with 'root' first. For each entry the parent
 * directories of the link location are created with mode 0755, and then the link itself is created
 * with symlink_idempotent(), whose first argument is the link target, the second the link location, and
 * whose third argument requests a relative link. */
2406 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2407 _cleanup_free_
char *src_abs
= NULL
;
2412 src_abs
= path_join(root
, source
);
2416 STRV_FOREACH(dst
, symlinks
) {
2417 _cleanup_free_
char *dst_abs
= NULL
;
2419 dst_abs
= path_join(root
, *dst
);
2423 r
= mkdir_parents_label(dst_abs
, 0755);
2427 r
= symlink_idempotent(src_abs
, dst_abs
, true);
2435 static int setup_exec_directory(
2437 const ExecContext
*context
,
2438 const ExecParameters
*params
,
2441 ExecDirectoryType type
,
2442 bool needs_mount_namespace
,
2445 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2446 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2447 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2448 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2449 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2450 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2456 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2457 assert(exit_status
);
2459 if (!params
->prefix
[type
])
2462 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2463 if (!uid_is_valid(uid
))
2465 if (!gid_is_valid(gid
))
2469 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2470 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2472 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2478 r
= mkdir_parents_label(p
, 0755);
2482 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2484 /* If we are in user mode, and a configuration directory exists but a state directory
2485 * doesn't exist, then we likely are upgrading from an older systemd version that
2486 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2487 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2488 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2489 * separated. If a service has both dirs configured but only the configuration dir
2490 * exists and the state dir does not, we assume we are looking at an update
2491 * situation. Hence, create a compatibility symlink, so that all expectations are
2494 * (We also do something similar with the log directory, which still doesn't exist in
2495 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2497 /* this assumes the state dir is always created before the configuration dir */
2498 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2499 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2501 r
= laccess(p
, F_OK
);
2503 _cleanup_free_
char *q
= NULL
;
2505 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2506 * under the configuration hierarchy. */
2508 if (type
== EXEC_DIRECTORY_STATE
)
2509 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], context
->directories
[type
].items
[i
].path
);
2510 else if (type
== EXEC_DIRECTORY_LOGS
)
2511 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", context
->directories
[type
].items
[i
].path
);
2513 assert_not_reached();
2519 r
= laccess(q
, F_OK
);
2521 /* It does exist! This hence looks like an update. Symlink the
2522 * configuration directory into the state directory. */
2524 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2528 log_unit_notice(u
, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2530 } else if (r
!= -ENOENT
)
2531 log_unit_warning_errno(u
, r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2534 log_unit_warning_errno(u
, r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2537 if (exec_directory_is_private(context
, type
)) {
2538 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2539 * case we want to avoid leaving a directory around fully accessible that is owned by
2540 * a dynamic user whose UID is later on reused. To lock this down we use the same
2541 * trick used by container managers to prohibit host users to get access to files of
2542 * the same UID in containers: we place everything inside a directory that has an
2543 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2544 * for unprivileged host code. We then use fs namespacing to make this directory
2545 * permeable for the service itself.
2547 * Specifically: for a service which wants a special directory "foo/" we first create
2548 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2549 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2550 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2551 * unprivileged host users can't look into it. Inside of the namespace of the unit
2552 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2553 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2554 * for the service and making sure it only gets access to the dirs it needs but no
2555 * others. Tricky? Yes, absolutely, but it works!
2557 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2558 * to be owned by the service itself.
2560 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2561 * for sharing files or sockets with other services. */
2563 pp
= path_join(params
->prefix
[type
], "private");
2569 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2570 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2574 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2579 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2580 r
= mkdir_parents_label(pp
, 0755);
2584 if (is_dir(p
, false) > 0 &&
2585 (laccess(pp
, F_OK
) == -ENOENT
)) {
2587 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2588 * it over. Most likely the service has been upgraded from one that didn't use
2589 * DynamicUser=1, to one that does. */
2591 log_unit_info(u
, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2592 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2593 exec_directory_type_to_string(type
), p
, pp
);
2595 r
= RET_NERRNO(rename(p
, pp
));
2599 /* Otherwise, create the actual directory for the service */
2601 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2602 if (r
< 0 && r
!= -EEXIST
)
2606 if (!context
->directories
[type
].items
[i
].only_create
) {
2607 /* And link it up from the original place.
2609 * 1) If a mount namespace is going to be used, then this symlink remains on
2610 * the host, and a new one for the child namespace will be created later.
2611 * 2) It is not necessary to create this symlink when one of its parent
2612 * directories is specified and already created. E.g.
2613 * StateDirectory=foo foo/bar
2614 * In that case, the inode points to pp and p for "foo/bar" are the same:
2615 * pp = "/var/lib/private/foo/bar"
2616 * p = "/var/lib/foo/bar"
2617 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2618 * we do not need to create the symlink, but we cannot create the symlink.
2619 * See issue #24783. */
2620 r
= symlink_idempotent(pp
, p
, true);
2626 _cleanup_free_
char *target
= NULL
;
2628 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2629 readlink_and_make_absolute(p
, &target
) >= 0) {
2630 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2632 /* This already exists and is a symlink? Interesting. Maybe it's one created
2633 * by DynamicUser=1 (see above)?
2635 * We do this for all directory types except for ConfigurationDirectory=,
2636 * since they all support the private/ symlink logic at least in some
2637 * configurations, see above. */
2639 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2643 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2649 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2650 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2654 if (path_equal(q_resolved
, target_resolved
)) {
2656 /* Hmm, apparently DynamicUser= was once turned on for this service,
2657 * but is no longer. Let's move the directory back up. */
2659 log_unit_info(u
, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2660 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2661 exec_directory_type_to_string(type
), q
, p
);
2663 r
= RET_NERRNO(unlink(p
));
2667 r
= RET_NERRNO(rename(q
, p
));
2673 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2678 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2681 /* Don't change the owner/access mode of the configuration directory,
2682 * as in the common case it is not written to by a service, and shall
2683 * not be writable. */
2685 r
= RET_NERRNO(stat(p
, &st
));
2689 /* Still complain if the access mode doesn't match */
2690 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2691 log_unit_warning(u
, "%s \'%s\' already exists but the mode is different. "
2692 "(File system: %o %sMode: %o)",
2693 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2694 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2701 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2702 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2703 * current UID/GID ownership.) */
2704 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2708 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2709 * available to user code anyway */
2710 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2713 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2714 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2715 * assignments to exist. */
2716 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2721 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2722 * they are set up later, to allow configuring empty var/run/etc. */
2723 if (!needs_mount_namespace
)
2724 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2725 r
= create_many_symlinks(params
->prefix
[type
],
2726 context
->directories
[type
].items
[i
].path
,
2727 context
->directories
[type
].items
[i
].symlinks
);
2735 *exit_status
= exit_status_table
[type
];
2740 static int setup_smack(
2741 const Manager
*manager
,
2742 const ExecContext
*context
,
2743 int executable_fd
) {
2747 assert(executable_fd
>= 0);
2749 if (context
->smack_process_label
) {
2750 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2753 } else if (manager
->defaults
.smack_process_label
) {
2754 _cleanup_free_
char *exec_label
= NULL
;
2756 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
2757 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
2760 r
= mac_smack_apply_pid(0, exec_label
?: manager
->defaults
.smack_process_label
);
2769 static int compile_bind_mounts(
2770 const ExecContext
*context
,
2771 const ExecParameters
*params
,
2772 BindMount
**ret_bind_mounts
,
2773 size_t *ret_n_bind_mounts
,
2774 char ***ret_empty_directories
) {
2776 _cleanup_strv_free_
char **empty_directories
= NULL
;
2777 BindMount
*bind_mounts
= NULL
;
2783 assert(ret_bind_mounts
);
2784 assert(ret_n_bind_mounts
);
2785 assert(ret_empty_directories
);
2787 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
2789 n
= context
->n_bind_mounts
;
2790 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2791 if (!params
->prefix
[t
])
2794 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++)
2795 n
+= !context
->directories
[t
].items
[i
].only_create
;
2799 *ret_bind_mounts
= NULL
;
2800 *ret_n_bind_mounts
= 0;
2801 *ret_empty_directories
= NULL
;
2805 bind_mounts
= new(BindMount
, n
);
2809 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
2810 BindMount
*item
= context
->bind_mounts
+ i
;
2811 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2813 s
= strdup(item
->source
);
2817 d
= strdup(item
->destination
);
2821 bind_mounts
[h
++] = (BindMount
) {
2822 .source
= TAKE_PTR(s
),
2823 .destination
= TAKE_PTR(d
),
2824 .read_only
= item
->read_only
,
2825 .recursive
= item
->recursive
,
2826 .ignore_enoent
= item
->ignore_enoent
,
2830 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2831 if (!params
->prefix
[t
])
2834 if (context
->directories
[t
].n_items
== 0)
2837 if (exec_directory_is_private(context
, t
) &&
2838 !exec_context_with_rootfs(context
)) {
2841 /* So this is for a dynamic user, and we need to make sure the process can access its own
2842 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2843 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2845 private_root
= path_join(params
->prefix
[t
], "private");
2849 r
= strv_consume(&empty_directories
, private_root
);
2854 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
2855 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2857 /* When one of the parent directories is in the list, we cannot create the symlink
2858 * for the child directory. See also the comments in setup_exec_directory(). */
2859 if (context
->directories
[t
].items
[i
].only_create
)
2862 if (exec_directory_is_private(context
, t
))
2863 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
2865 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
2869 if (exec_directory_is_private(context
, t
) &&
2870 exec_context_with_rootfs(context
))
2871 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2872 * directory is not created on the root directory. So, let's bind-mount the directory
2873 * on the 'non-private' place. */
2874 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
2880 bind_mounts
[h
++] = (BindMount
) {
2881 .source
= TAKE_PTR(s
),
2882 .destination
= TAKE_PTR(d
),
2884 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
2886 .ignore_enoent
= false,
2893 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
2894 *ret_n_bind_mounts
= n
;
2895 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2900 /* ret_symlinks will contain a list of pairs src:dest that describes
2901 * the symlinks to create later on. For example, the symlinks needed
2902 * to safely give private directories to DynamicUser=1 users. */
2903 static int compile_symlinks(
2904 const ExecContext
*context
,
2905 const ExecParameters
*params
,
2906 bool setup_os_release_symlink
,
2907 char ***ret_symlinks
) {
2909 _cleanup_strv_free_
char **symlinks
= NULL
;
2914 assert(ret_symlinks
);
2916 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2917 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
2918 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
2920 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
2921 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
2923 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
2924 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
2925 if (!src_abs
|| !dst_abs
)
2928 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
2933 if (!exec_directory_is_private(context
, dt
) ||
2934 exec_context_with_rootfs(context
) ||
2935 context
->directories
[dt
].items
[i
].only_create
)
2938 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
2942 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
2946 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
2952 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2953 * and readers will never get a half-written version. Note that, while the paths specified here are
2954 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2955 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2956 if (setup_os_release_symlink
) {
2957 r
= strv_extend(&symlinks
, "/run/host/.os-release-stage/os-release");
2961 r
= strv_extend(&symlinks
, "/run/host/os-release");
2966 *ret_symlinks
= TAKE_PTR(symlinks
);
2971 static bool insist_on_sandboxing(
2972 const ExecContext
*context
,
2973 const char *root_dir
,
2974 const char *root_image
,
2975 const BindMount
*bind_mounts
,
2976 size_t n_bind_mounts
) {
2979 assert(n_bind_mounts
== 0 || bind_mounts
);
2981 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2982 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2983 * rearrange stuff in a way we cannot ignore gracefully. */
2985 if (context
->n_temporary_filesystems
> 0)
2988 if (root_dir
|| root_image
)
2991 if (context
->n_mount_images
> 0)
2994 if (context
->dynamic_user
)
2997 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3000 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3002 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
3003 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
3006 if (context
->log_namespace
)
3012 static int setup_ephemeral(const ExecContext
*context
, ExecRuntime
*runtime
) {
3013 _cleanup_close_
int fd
= -EBADF
;
3016 if (!runtime
|| !runtime
->ephemeral_copy
)
3019 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
3021 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
3023 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
3025 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
3027 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3031 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3033 log_debug("Making ephemeral snapshot of %s to %s",
3034 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3036 if (context
->root_image
)
3037 fd
= copy_file(context
->root_image
, runtime
->ephemeral_copy
, O_EXCL
, 0600,
3038 COPY_LOCK_BSD
|COPY_REFLINK
|COPY_CRTIME
);
3040 fd
= btrfs_subvol_snapshot_at(AT_FDCWD
, context
->root_directory
,
3041 AT_FDCWD
, runtime
->ephemeral_copy
,
3042 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3043 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3044 BTRFS_SNAPSHOT_RECURSIVE
|
3045 BTRFS_SNAPSHOT_LOCK_BSD
);
3047 return log_debug_errno(fd
, "Failed to snapshot %s to %s: %m",
3048 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3050 if (context
->root_image
) {
3051 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3052 * which tends to not perform well in combination with lots of random writes.
3054 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3055 * copy, but we at least want to make the intention clear.
3057 r
= chattr_fd(fd
, FS_NOCOW_FL
, FS_NOCOW_FL
, NULL
);
3059 log_debug_errno(fd
, "Failed to disable copy-on-write for %s, ignoring: %m", runtime
->ephemeral_copy
);
3062 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
3064 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
3069 static int verity_settings_prepare(
3070 VeritySettings
*verity
,
3071 const char *root_image
,
3072 const void *root_hash
,
3073 size_t root_hash_size
,
3074 const char *root_hash_path
,
3075 const void *root_hash_sig
,
3076 size_t root_hash_sig_size
,
3077 const char *root_hash_sig_path
,
3078 const char *verity_data_path
) {
3087 d
= memdup(root_hash
, root_hash_size
);
3091 free_and_replace(verity
->root_hash
, d
);
3092 verity
->root_hash_size
= root_hash_size
;
3093 verity
->designator
= PARTITION_ROOT
;
3096 if (root_hash_sig
) {
3099 d
= memdup(root_hash_sig
, root_hash_sig_size
);
3103 free_and_replace(verity
->root_hash_sig
, d
);
3104 verity
->root_hash_sig_size
= root_hash_sig_size
;
3105 verity
->designator
= PARTITION_ROOT
;
3108 if (verity_data_path
) {
3109 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
3114 r
= verity_settings_load(
3118 root_hash_sig_path
);
3120 return log_debug_errno(r
, "Failed to load root hash: %m");
3125 static int apply_mount_namespace(
3127 ExecCommandFlags command_flags
,
3128 const ExecContext
*context
,
3129 const ExecParameters
*params
,
3130 ExecRuntime
*runtime
,
3131 const char *memory_pressure_path
,
3132 char **error_path
) {
3134 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3135 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
3136 **read_write_paths_cleanup
= NULL
;
3137 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3138 *extension_dir
= NULL
, *host_os_release_stage
= NULL
;
3139 const char *root_dir
= NULL
, *root_image
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3140 char **read_write_paths
;
3141 bool needs_sandboxing
, setup_os_release_symlink
;
3142 BindMount
*bind_mounts
= NULL
;
3143 size_t n_bind_mounts
= 0;
3148 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
3150 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3151 r
= setup_ephemeral(context
, runtime
);
3155 if (context
->root_image
)
3156 root_image
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_image
;
3158 root_dir
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
;
3161 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3165 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3166 * service will need to write to it in order to start the notifications. */
3167 if (context
->protect_control_groups
&& memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
3168 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
3169 if (!read_write_paths_cleanup
)
3172 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
3176 read_write_paths
= read_write_paths_cleanup
;
3178 read_write_paths
= context
->read_write_paths
;
3180 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3181 if (needs_sandboxing
) {
3182 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3183 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3184 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3186 if (context
->private_tmp
&& runtime
&& runtime
->shared
) {
3187 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3188 tmp_dir
= runtime
->shared
->tmp_dir
;
3189 else if (runtime
->shared
->tmp_dir
)
3190 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
3192 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3193 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
3194 else if (runtime
->shared
->var_tmp_dir
)
3195 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
3199 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3200 setup_os_release_symlink
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
) && (root_dir
|| root_image
);
3201 r
= compile_symlinks(context
, params
, setup_os_release_symlink
, &symlinks
);
3205 if (context
->mount_propagation_flag
== MS_SHARED
)
3206 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3208 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
3209 r
= exec_context_get_credential_directory(context
, params
, u
->id
, &creds_path
);
3214 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
3215 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
3219 incoming_dir
= strdup("/run/systemd/incoming");
3223 extension_dir
= strdup("/run/systemd/unit-extensions");
3227 /* If running under a different root filesystem, propagate the host's os-release. We make a
3228 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3229 if (setup_os_release_symlink
) {
3230 host_os_release_stage
= strdup("/run/systemd/propagate/.os-release-stage");
3231 if (!host_os_release_stage
)
3235 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
3237 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0)
3240 if (setup_os_release_symlink
) {
3241 if (asprintf(&host_os_release_stage
,
3242 "/run/user/" UID_FMT
"/systemd/propagate/.os-release-stage",
3249 r
= verity_settings_prepare(
3252 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3253 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3254 context
->root_verity
);
3259 NamespaceParameters parameters
= {
3260 .runtime_scope
= params
->runtime_scope
,
3262 .root_directory
= root_dir
,
3263 .root_image
= root_image
,
3264 .root_image_options
= context
->root_image_options
,
3265 .root_image_policy
= context
->root_image_policy
?: &image_policy_service
,
3267 .read_write_paths
= read_write_paths
,
3268 .read_only_paths
= needs_sandboxing
? context
->read_only_paths
: NULL
,
3269 .inaccessible_paths
= needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3271 .exec_paths
= needs_sandboxing
? context
->exec_paths
: NULL
,
3272 .no_exec_paths
= needs_sandboxing
? context
->no_exec_paths
: NULL
,
3274 .empty_directories
= empty_directories
,
3275 .symlinks
= symlinks
,
3277 .bind_mounts
= bind_mounts
,
3278 .n_bind_mounts
= n_bind_mounts
,
3280 .temporary_filesystems
= context
->temporary_filesystems
,
3281 .n_temporary_filesystems
= context
->n_temporary_filesystems
,
3283 .mount_images
= context
->mount_images
,
3284 .n_mount_images
= context
->n_mount_images
,
3285 .mount_image_policy
= context
->mount_image_policy
?: &image_policy_service
,
3288 .var_tmp_dir
= var_tmp_dir
,
3290 .creds_path
= creds_path
,
3291 .log_namespace
= context
->log_namespace
,
3292 .mount_propagation_flag
= context
->mount_propagation_flag
,
3296 .extension_images
= context
->extension_images
,
3297 .n_extension_images
= context
->n_extension_images
,
3298 .extension_image_policy
= context
->extension_image_policy
?: &image_policy_sysext
,
3299 .extension_directories
= context
->extension_directories
,
3301 .propagate_dir
= propagate_dir
,
3302 .incoming_dir
= incoming_dir
,
3303 .extension_dir
= extension_dir
,
3304 .notify_socket
= root_dir
|| root_image
? params
->notify_socket
: NULL
,
3305 .host_os_release_stage
= host_os_release_stage
,
3307 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3308 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3309 * sandbox inside the mount namespace. */
3310 .ignore_protect_paths
= !needs_sandboxing
&& !context
->dynamic_user
&& root_dir
,
3312 .protect_control_groups
= needs_sandboxing
&& context
->protect_control_groups
,
3313 .protect_kernel_tunables
= needs_sandboxing
&& context
->protect_kernel_tunables
,
3314 .protect_kernel_modules
= needs_sandboxing
&& context
->protect_kernel_modules
,
3315 .protect_kernel_logs
= needs_sandboxing
&& context
->protect_kernel_logs
,
3316 .protect_hostname
= needs_sandboxing
&& context
->protect_hostname
,
3318 .private_dev
= needs_sandboxing
&& context
->private_devices
,
3319 .private_network
= needs_sandboxing
&& exec_needs_network_namespace(context
),
3320 .private_ipc
= needs_sandboxing
&& exec_needs_ipc_namespace(context
),
3322 .mount_apivfs
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
),
3324 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3325 .mount_nosuid
= needs_sandboxing
&& context
->no_new_privileges
&& !mac_selinux_use(),
3327 .protect_home
= needs_sandboxing
&& context
->protect_home
,
3328 .protect_system
= needs_sandboxing
&& context
->protect_system
,
3329 .protect_proc
= needs_sandboxing
&& context
->protect_proc
,
3330 .proc_subset
= needs_sandboxing
&& context
->proc_subset
,
3333 r
= setup_namespace(¶meters
, error_path
);
3334 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3335 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3336 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3337 * completely different execution environment. */
3339 if (insist_on_sandboxing(
3341 root_dir
, root_image
,
3344 return log_unit_debug_errno(u
,
3345 SYNTHETIC_ERRNO(EOPNOTSUPP
),
3346 "Failed to set up namespace, and refusing to continue since "
3347 "the selected namespacing options alter mount environment non-trivially.\n"
3348 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3350 context
->n_temporary_filesystems
,
3353 yes_no(context
->dynamic_user
));
3355 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
3362 static int apply_working_directory(
3363 const ExecContext
*context
,
3364 const ExecParameters
*params
,
3365 ExecRuntime
*runtime
,
3372 assert(exit_status
);
3374 if (context
->working_directory_home
) {
3377 *exit_status
= EXIT_CHDIR
;
3384 wd
= empty_to_root(context
->working_directory
);
3386 if (params
->flags
& EXEC_APPLY_CHROOT
)
3389 d
= prefix_roota((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
, wd
);
3391 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
3392 *exit_status
= EXIT_CHDIR
;
3399 static int apply_root_directory(
3400 const ExecContext
*context
,
3401 const ExecParameters
*params
,
3402 ExecRuntime
*runtime
,
3403 const bool needs_mount_ns
,
3407 assert(exit_status
);
3409 if (params
->flags
& EXEC_APPLY_CHROOT
)
3410 if (!needs_mount_ns
&& context
->root_directory
)
3411 if (chroot((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
) < 0) {
3412 *exit_status
= EXIT_CHROOT
;
3419 static int setup_keyring(
3421 const ExecContext
*context
,
3422 const ExecParameters
*p
,
3423 uid_t uid
, gid_t gid
) {
3425 key_serial_t keyring
;
3434 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3435 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3436 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3437 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3438 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3439 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3441 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3444 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3445 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3446 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3447 * & group is just as nasty as acquiring a reference to the user keyring. */
3449 saved_uid
= getuid();
3450 saved_gid
= getgid();
3452 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3453 if (setregid(gid
, -1) < 0)
3454 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
3457 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3458 if (setreuid(uid
, -1) < 0) {
3459 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
3464 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3465 if (keyring
== -1) {
3466 if (errno
== ENOSYS
)
3467 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
3468 else if (ERRNO_IS_PRIVILEGE(errno
))
3469 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
3470 else if (errno
== EDQUOT
)
3471 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
3473 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
3478 /* When requested link the user keyring into the session keyring. */
3479 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3481 if (keyctl(KEYCTL_LINK
,
3482 KEY_SPEC_USER_KEYRING
,
3483 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3484 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
3489 /* Restore uid/gid back */
3490 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3491 if (setreuid(saved_uid
, -1) < 0) {
3492 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
3497 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3498 if (setregid(saved_gid
, -1) < 0)
3499 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
3502 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3503 if (!sd_id128_is_null(u
->invocation_id
)) {
3506 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
3508 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
3510 if (keyctl(KEYCTL_SETPERM
, key
,
3511 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3512 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3513 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
3518 /* Revert back uid & gid for the last time, and exit */
3519 /* no extra logging, as only the first already reported error matters */
3520 if (getuid() != saved_uid
)
3521 (void) setreuid(saved_uid
, -1);
3523 if (getgid() != saved_gid
)
3524 (void) setregid(saved_gid
, -1);
3529 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
3535 array
[(*n
)++] = pair
[0];
3537 array
[(*n
)++] = pair
[1];
3540 static int close_remaining_fds(
3541 const ExecParameters
*params
,
3542 const ExecRuntime
*runtime
,
3545 const int *fds
, size_t n_fds
) {
3547 size_t n_dont_close
= 0;
3548 int dont_close
[n_fds
+ 14];
3552 if (params
->stdin_fd
>= 0)
3553 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3554 if (params
->stdout_fd
>= 0)
3555 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3556 if (params
->stderr_fd
>= 0)
3557 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3560 dont_close
[n_dont_close
++] = socket_fd
;
3562 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3563 n_dont_close
+= n_fds
;
3567 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
3569 if (runtime
&& runtime
->shared
) {
3570 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
3571 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
3574 if (runtime
&& runtime
->dynamic_creds
) {
3575 if (runtime
->dynamic_creds
->user
)
3576 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
3577 if (runtime
->dynamic_creds
->group
)
3578 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
3581 if (user_lookup_fd
>= 0)
3582 dont_close
[n_dont_close
++] = user_lookup_fd
;
3584 return close_all_fds(dont_close
, n_dont_close
);
3587 static int send_user_lookup(
3595 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3596 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3599 if (user_lookup_fd
< 0)
3602 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3605 if (writev(user_lookup_fd
,
3607 IOVEC_MAKE(&uid
, sizeof(uid
)),
3608 IOVEC_MAKE(&gid
, sizeof(gid
)),
3609 IOVEC_MAKE_STRING(unit
->id
) }, 3) < 0)
3615 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
3622 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3627 if (!c
->working_directory_home
)
3630 r
= get_home_dir(buf
);
3638 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3639 _cleanup_strv_free_
char ** list
= NULL
;
3646 assert(c
->dynamic_user
);
3648 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3649 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3652 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3653 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
3659 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
3662 if (exec_directory_is_private(c
, t
))
3663 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
3665 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
3669 r
= strv_consume(&list
, e
);
3675 *ret
= TAKE_PTR(list
);
3680 static int exec_parameters_get_cgroup_path(
3681 const ExecParameters
*params
,
3682 const CGroupContext
*c
,
3685 const char *subgroup
= NULL
;
3691 if (!params
->cgroup_path
)
3694 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3695 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3696 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3697 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3698 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3699 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3700 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3701 * flag, which is only passed for the former statements, not for the latter. */
3703 if (FLAGS_SET(params
->flags
, EXEC_CGROUP_DELEGATE
) && (FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
) || c
->delegate_subgroup
)) {
3704 if (FLAGS_SET(params
->flags
, EXEC_IS_CONTROL
))
3705 subgroup
= ".control";
3707 subgroup
= c
->delegate_subgroup
;
3711 p
= path_join(params
->cgroup_path
, subgroup
);
3713 p
= strdup(params
->cgroup_path
);
3721 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
3722 _cleanup_(cpu_set_reset
) CPUSet s
= {};
3728 if (!c
->numa_policy
.nodes
.set
) {
3729 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3733 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
3739 return cpu_set_add_all(ret
, &s
);
3742 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
3745 return c
->cpu_affinity_from_numa
;
3748 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
3753 assert(*n_fds
< fds_size
);
3761 if (fd
< 3 + (int) *n_fds
) {
3762 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3763 * the fds we pass to the process (or which are closed only during execve). */
3765 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
3769 close_and_replace(fd
, r
);
3772 *ret_fd
= fds
[*n_fds
] = fd
;
/* Connects to the AF_UNIX socket inode referenced by 'ofd'. The socket address is built from the
 * FORMAT_PROC_FD_PATH(ofd) path rather than from of->path; of->path is only used in log messages.
 *
 * Since the socket type of the peer is not known, each type in socket_types[] (SOCK_DGRAM, SOCK_STREAM,
 * SOCK_SEQPACKET) is tried in turn with a fresh SOCK_CLOEXEC socket. A connect() result of -EPROTOTYPE
 * is special-cased (presumably: try the next socket type — the statement is not visible here, confirm);
 * other failures are logged and returned. If the loop finishes without success, a synthetic EPROTOTYPE
 * error is logged and returned.
 *
 * NOTE(review): the success path (what is returned for a connected socket) and the definition of
 * 'sa_len' are not visible in this listing. */
3777 static int connect_unix_harder(Unit
*u
, const OpenFile
*of
, int ofd
) {
3778 union sockaddr_union addr
= {
3779 .un
.sun_family
= AF_UNIX
,
3782 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
3789 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
3791 return log_unit_error_errno(u
, r
, "Failed to set sockaddr for %s: %m", of
->path
);
3795 for (size_t i
= 0; i
< ELEMENTSOF(socket_types
); i
++) {
3796 _cleanup_close_
int fd
= -EBADF
;
3798 fd
= socket(AF_UNIX
, socket_types
[i
] | SOCK_CLOEXEC
, 0);
3800 return log_unit_error_errno(u
, errno
, "Failed to create socket for %s: %m", of
->path
);
3802 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
3803 if (r
== -EPROTOTYPE
)
3806 return log_unit_error_errno(u
, r
, "Failed to connect socket for %s: %m", of
->path
);
3811 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EPROTOTYPE
), "Failed to connect socket for \"%s\".", of
->path
);
/* Opens the file system object named by an OpenFile entry ('of') on behalf of unit 'u'.
 *
 * of->path is first opened with O_PATH|O_CLOEXEC and fstat()ed. If it is a socket, it is connected to via
 * connect_unix_harder(), and when OPENFILE_READ_ONLY is set the sending direction is shut down with
 * shutdown(SHUT_WR). Otherwise the O_PATH fd is reopened through fd_reopen() with O_RDONLY or O_RDWR
 * (depending on OPENFILE_READ_ONLY) plus O_CLOEXEC. All failures are logged against the unit.
 *
 * NOTE(review): the statements executed for OPENFILE_APPEND / OPENFILE_TRUNCATE (presumably adding
 * O_APPEND / O_TRUNC to 'flags'), the declaration of 'st', and the final return value are not visible in
 * this listing — confirm against the full source. */
3814 static int get_open_file_fd(Unit
*u
, const OpenFile
*of
) {
3816 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
/* O_PATH: obtain a reference to the inode first, so its type can be inspected before really opening it. */
3821 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
3823 return log_unit_error_errno(u
, errno
, "Could not open \"%s\": %m", of
->path
);
3825 if (fstat(ofd
, &st
) < 0)
3826 return log_unit_error_errno(u
, errno
, "Failed to stat %s: %m", of
->path
);
3828 if (S_ISSOCK(st
.st_mode
)) {
3829 fd
= connect_unix_harder(u
, of
, ofd
);
3833 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
3834 return log_unit_error_errno(u
, errno
, "Failed to shutdown send for socket %s: %m",
3837 log_unit_debug(u
, "socket %s opened (fd=%d)", of
->path
, fd
);
/* Access mode used when reopening the O_PATH fd below. */
3839 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
3840 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
3842 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
3845 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
3847 return log_unit_error_errno(u
, fd
, "Failed to open file %s: %m", of
->path
);
3849 log_unit_debug(u
, "file %s opened (fd=%d)", of
->path
, fd
);
/* Walks the 'open_files' list and, for each entry, obtains a file descriptor with get_open_file_fd().
 * The descriptor is stored in the (*fds) array at index *n_fds (the array is grown with GREEDY_REALLOC())
 * and the entry's fdname is appended to 'fdnames' with strv_extend(). Ownership of the descriptor moves
 * into the array via TAKE_FD(), so the _cleanup_close_ on the local 'fd' no longer closes it.
 *
 * For entries flagged OPENFILE_GRACEFUL a failure to obtain the descriptor is only logged at debug level
 * ("ignoring").
 *
 * NOTE(review): part of the parameter list, the error returns and the update of *n_fds are not visible
 * in this listing — confirm against the full source. */
3855 static int collect_open_file_fds(
3857 OpenFile
* open_files
,
3868 LIST_FOREACH(open_files
, of
, open_files
) {
3869 _cleanup_close_
int fd
= -EBADF
;
3871 fd
= get_open_file_fd(u
, of
);
3873 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
3874 log_unit_debug_errno(u
, fd
, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of
->path
);
3881 if (!GREEDY_REALLOC(*fds
, *n_fds
+ 1))
3884 r
= strv_extend(fdnames
, of
->fdname
);
3888 (*fds
)[*n_fds
] = TAKE_FD(fd
);
/* Emits a structured LOG_DEBUG record for 'unit' carrying an EXECUTABLE= field, the unit's invocation ID
 * and a message of the form "<msg>: <command line>". The command line is produced from 'argv' by
 * quote_command_line() with SHELL_ESCAPE_EMPTY and is freed on return (_cleanup_free_). The result is
 * passed through strnull() before formatting, presumably so that a failed (NULL) quoting is still
 * printable — confirm.
 *
 * NOTE(review): lines between the signature and the quoting call are not visible in this listing. */
3896 static void log_command_line(Unit
*unit
, const char *msg
, const char *executable
, char **argv
) {
3904 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
3906 log_unit_struct(unit
, LOG_DEBUG
,
3907 "EXECUTABLE=%s", executable
,
3908 LOG_UNIT_MESSAGE(unit
, "%s: %s", msg
, strnull(cmdline
)),
3909 LOG_UNIT_INVOCATION_ID(unit
));
/* Tells whether a user namespace has to be set up for this execution context because the manager runs
 * in user scope. For RUNTIME_SCOPE_USER the result is true as soon as any one of the sandboxing-related
 * settings tested below is enabled (private_users itself, the other Private*= style flags, namespace
 * paths, bind mounts / temporary file systems, root directory, extension directories, the Protect*=
 * style settings, and the various path lists).
 *
 * NOTE(review): the value returned when runtime_scope is not RUNTIME_SCOPE_USER is not visible in this
 * listing (presumably false, per the comment below) — confirm against the full source. */
3912 static bool exec_context_need_unprivileged_private_users(
3913 const ExecContext
*context
,
3914 const ExecParameters
*params
) {
3919 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3920 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3921 * (system manager) then we have privileges and don't need this. */
3922 if (params
->runtime_scope
!= RUNTIME_SCOPE_USER
)
3925 return context
->private_users
||
3926 context
->private_tmp
||
3927 context
->private_devices
||
3928 context
->private_network
||
3929 context
->network_namespace_path
||
3930 context
->private_ipc
||
3931 context
->ipc_namespace_path
||
3932 context
->private_mounts
> 0 ||
3933 context
->mount_apivfs
||
3934 context
->n_bind_mounts
> 0 ||
3935 context
->n_temporary_filesystems
> 0 ||
3936 context
->root_directory
||
3937 !strv_isempty(context
->extension_directories
) ||
3938 context
->protect_system
!= PROTECT_SYSTEM_NO
||
3939 context
->protect_home
!= PROTECT_HOME_NO
||
3940 context
->protect_kernel_tunables
||
3941 context
->protect_kernel_modules
||
3942 context
->protect_kernel_logs
||
3943 context
->protect_control_groups
||
3944 context
->protect_clock
||
3945 context
->protect_hostname
||
3946 !strv_isempty(context
->read_write_paths
) ||
3947 !strv_isempty(context
->read_only_paths
) ||
3948 !strv_isempty(context
->inaccessible_paths
) ||
3949 !strv_isempty(context
->exec_paths
) ||
3950 !strv_isempty(context
->no_exec_paths
);
3953 static int exec_child(
3955 const ExecCommand
*command
,
3956 const ExecContext
*context
,
3957 const ExecParameters
*params
,
3958 ExecRuntime
*runtime
,
3959 const CGroupContext
*cgroup_context
,
3961 const int named_iofds
[static 3],
3963 size_t n_socket_fds
,
3964 size_t n_storage_fds
,
3969 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
3970 int r
, ngids
= 0, exec_fd
;
3971 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
3972 const char *username
= NULL
, *groupname
= NULL
;
3973 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
;
3974 const char *home
= NULL
, *shell
= NULL
;
3975 char **final_argv
= NULL
;
3976 dev_t journal_stream_dev
= 0;
3977 ino_t journal_stream_ino
= 0;
3978 bool userns_set_up
= false;
3979 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3980 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
3981 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
3982 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
3984 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
3985 bool use_selinux
= false;
3988 bool use_smack
= false;
3991 bool use_apparmor
= false;
3993 uid_t saved_uid
= getuid();
3994 gid_t saved_gid
= getgid();
3995 uid_t uid
= UID_INVALID
;
3996 gid_t gid
= GID_INVALID
;
3997 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
3998 n_keep_fds
; /* total number of fds not to close */
4000 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4001 int ngids_after_pam
= 0;
4002 _cleanup_free_
int *fds
= NULL
;
4003 _cleanup_strv_free_
char **fdnames
= NULL
;
4009 assert(exit_status
);
4011 /* Explicitly test for CVE-2021-4034 inspired invocations */
4012 assert(command
->path
);
4013 assert(!strv_isempty(command
->argv
));
4015 rename_process_from_path(command
->path
);
4017 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4018 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4019 * both of which will be demoted to SIG_DFL. */
4020 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4023 if (context
->ignore_sigpipe
)
4024 (void) ignore_signals(SIGPIPE
);
4026 r
= reset_signal_mask();
4028 *exit_status
= EXIT_SIGNAL_MASK
;
4029 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4032 if (params
->idle_pipe
)
4033 do_idle_pipe_dance(params
->idle_pipe
);
4035 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4036 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4037 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4038 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4041 log_set_open_when_needed(true);
4042 log_settle_target();
4043 if (context
->log_level_max
>= 0)
4044 log_set_max_level(context
->log_level_max
);
4046 /* In case anything used libc syslog(), close this here, too */
4049 fds
= newdup(int, params_fds
, n_fds
);
4051 *exit_status
= EXIT_MEMORY
;
4055 fdnames
= strv_copy((char**) params
->fd_names
);
4057 *exit_status
= EXIT_MEMORY
;
4061 r
= collect_open_file_fds(unit
, params
->open_files
, &fds
, &fdnames
, &n_fds
);
4063 *exit_status
= EXIT_FDS
;
4064 return log_unit_error_errno(unit
, r
, "Failed to get OpenFile= file descriptors: %m");
4067 int keep_fds
[n_fds
+ 3];
4068 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4071 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4073 *exit_status
= EXIT_FDS
;
4074 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4078 if (unit
->manager
->restrict_fs
) {
4079 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4080 if (bpf_map_fd
< 0) {
4081 *exit_status
= EXIT_FDS
;
4082 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4085 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4087 *exit_status
= EXIT_FDS
;
4088 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4093 r
= close_remaining_fds(params
, runtime
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4095 *exit_status
= EXIT_FDS
;
4096 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4099 if (!context
->same_pgrp
&&
4101 *exit_status
= EXIT_SETSID
;
4102 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4105 exec_context_tty_reset(context
, params
);
4107 if (unit_shall_confirm_spawn(unit
)) {
4108 _cleanup_free_
char *cmdline
= NULL
;
4110 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4112 *exit_status
= EXIT_MEMORY
;
4116 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4117 if (r
!= CONFIRM_EXECUTE
) {
4118 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4119 *exit_status
= EXIT_SUCCESS
;
4123 *exit_status
= EXIT_CONFIRM
;
4124 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4125 "Execution cancelled by the user");
4129 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4130 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4131 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4132 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4133 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4134 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4135 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4136 *exit_status
= EXIT_MEMORY
;
4137 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4140 if (context
->dynamic_user
&& runtime
&& runtime
->dynamic_creds
) {
4141 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4143 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4144 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4145 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4146 *exit_status
= EXIT_USER
;
4147 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4150 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4152 *exit_status
= EXIT_MEMORY
;
4156 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
4158 *exit_status
= EXIT_USER
;
4160 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4161 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4162 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
4165 if (!uid_is_valid(uid
)) {
4166 *exit_status
= EXIT_USER
;
4167 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
4170 if (!gid_is_valid(gid
)) {
4171 *exit_status
= EXIT_USER
;
4172 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
4175 if (runtime
->dynamic_creds
->user
)
4176 username
= runtime
->dynamic_creds
->user
->name
;
4179 if (context
->user
) {
4180 r
= get_fixed_user(context
->user
, &username
, &uid
, &gid
, &home
, &shell
);
4182 *exit_status
= EXIT_USER
;
4183 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
4187 if (context
->group
) {
4188 r
= get_fixed_group(context
->group
, &groupname
, &gid
);
4190 *exit_status
= EXIT_GROUP
;
4191 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
4196 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4197 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
4198 &supplementary_gids
, &ngids
);
4200 *exit_status
= EXIT_GROUP
;
4201 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
4204 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
4206 *exit_status
= EXIT_USER
;
4207 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
4210 user_lookup_fd
= safe_close(user_lookup_fd
);
4212 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
4214 *exit_status
= EXIT_CHDIR
;
4215 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
4218 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4220 (void) fd_nonblock(socket_fd
, false);
4222 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4223 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4224 if (params
->cgroup_path
) {
4225 _cleanup_free_
char *p
= NULL
;
4227 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
4229 *exit_status
= EXIT_CGROUP
;
4230 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4233 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
4234 if (r
== -EUCLEAN
) {
4235 *exit_status
= EXIT_CGROUP
;
4236 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
4237 "because the cgroup or one of its parents or "
4238 "siblings is in the threaded mode: %m", p
);
4241 *exit_status
= EXIT_CGROUP
;
4242 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
4246 if (context
->network_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4247 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
4249 *exit_status
= EXIT_NETWORK
;
4250 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
4254 if (context
->ipc_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4255 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
4257 *exit_status
= EXIT_NAMESPACE
;
4258 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
4262 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
4264 *exit_status
= EXIT_STDIN
;
4265 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
4268 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4270 *exit_status
= EXIT_STDOUT
;
4271 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
4274 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4276 *exit_status
= EXIT_STDERR
;
4277 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
4280 if (context
->oom_score_adjust_set
) {
4281 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4282 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4283 r
= set_oom_score_adjust(context
->oom_score_adjust
);
4284 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4285 log_unit_debug_errno(unit
, r
,
4286 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4288 *exit_status
= EXIT_OOM_ADJUST
;
4289 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
4293 if (context
->coredump_filter_set
) {
4294 r
= set_coredump_filter(context
->coredump_filter
);
4295 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4296 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
4298 *exit_status
= EXIT_LIMITS
;
4299 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
4303 if (context
->nice_set
) {
4304 r
= setpriority_closest(context
->nice
);
4306 *exit_status
= EXIT_NICE
;
4307 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
4311 if (context
->cpu_sched_set
) {
4312 struct sched_param param
= {
4313 .sched_priority
= context
->cpu_sched_priority
,
4316 r
= sched_setscheduler(0,
4317 context
->cpu_sched_policy
|
4318 (context
->cpu_sched_reset_on_fork
?
4319 SCHED_RESET_ON_FORK
: 0),
4322 *exit_status
= EXIT_SETSCHEDULER
;
4323 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
4327 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
4328 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
4329 const CPUSet
*cpu_set
;
4331 if (context
->cpu_affinity_from_numa
) {
4332 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
4334 *exit_status
= EXIT_CPUAFFINITY
;
4335 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
4338 cpu_set
= &converted_cpu_set
;
4340 cpu_set
= &context
->cpu_set
;
4342 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
4343 *exit_status
= EXIT_CPUAFFINITY
;
4344 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
4348 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
4349 r
= apply_numa_policy(&context
->numa_policy
);
4350 if (ERRNO_IS_NEG_NOT_SUPPORTED(r
))
4351 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
4353 *exit_status
= EXIT_NUMA_POLICY
;
4354 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
4358 if (context
->ioprio_set
)
4359 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
4360 *exit_status
= EXIT_IOPRIO
;
4361 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
4364 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
4365 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
4366 *exit_status
= EXIT_TIMERSLACK
;
4367 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
4370 if (context
->personality
!= PERSONALITY_INVALID
) {
4371 r
= safe_personality(context
->personality
);
4373 *exit_status
= EXIT_PERSONALITY
;
4374 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
4378 if (context
->utmp_id
) {
4379 const char *line
= context
->tty_path
?
4380 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
4382 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
4384 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
4385 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
4390 if (uid_is_valid(uid
)) {
4391 r
= chown_terminal(STDIN_FILENO
, uid
);
4393 *exit_status
= EXIT_STDIN
;
4394 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
4398 if (params
->cgroup_path
) {
4399 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4400 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4401 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4402 * touch a single hierarchy too. */
4404 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
4405 _cleanup_free_
char *p
= NULL
;
4407 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
4409 *exit_status
= EXIT_CGROUP
;
4410 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
4413 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
4415 *exit_status
= EXIT_CGROUP
;
4416 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4419 r
= cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER
, p
, uid
, gid
);
4421 *exit_status
= EXIT_CGROUP
;
4422 return log_unit_error_errno(unit
, r
, "Failed to adjust control subgroup access: %m");
4427 if (cgroup_context
&& cg_unified() > 0 && is_pressure_supported() > 0) {
4428 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
4429 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
4431 *exit_status
= EXIT_MEMORY
;
4435 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
4437 log_unit_full_errno(unit
, r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
4438 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
4439 memory_pressure_path
= mfree(memory_pressure_path
);
4441 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_OFF
) {
4442 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4443 if (!memory_pressure_path
) {
4444 *exit_status
= EXIT_MEMORY
;
4451 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
4453 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4454 r
= setup_exec_directory(unit
, context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
4456 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
4459 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4460 r
= exec_setup_credentials(context
, params
, unit
->id
, uid
, gid
);
4462 *exit_status
= EXIT_CREDENTIALS
;
4463 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
4467 r
= build_environment(
4479 memory_pressure_path
,
4482 *exit_status
= EXIT_MEMORY
;
4486 r
= build_pass_environment(context
, &pass_env
);
4488 *exit_status
= EXIT_MEMORY
;
4492 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4493 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4494 * not specify PATH but the unit has ExecSearchPath. */
4495 if (!strv_isempty(context
->exec_search_path
)) {
4496 _cleanup_free_
char *joined
= NULL
;
4498 joined
= strv_join(context
->exec_search_path
, ":");
4500 *exit_status
= EXIT_MEMORY
;
4504 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
4506 *exit_status
= EXIT_MEMORY
;
4511 accum_env
= strv_env_merge(params
->environment
,
4513 joined_exec_search_path
,
4515 context
->environment
,
4518 *exit_status
= EXIT_MEMORY
;
4521 accum_env
= strv_env_clean(accum_env
);
4523 (void) umask(context
->umask
);
4525 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
4527 *exit_status
= EXIT_KEYRING
;
4528 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
4531 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4533 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4535 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4536 * for it, and the kernel doesn't actually support ambient caps. */
4537 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
4539 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4540 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4542 if (needs_ambient_hack
)
4543 needs_setuid
= false;
4545 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
4547 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
4549 if (needs_sandboxing
) {
4550 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4551 * /sys being present. The actual MAC context application will happen later, as late as
4552 * possible, to avoid impacting our own code paths. */
4555 use_selinux
= mac_selinux_use();
4558 use_smack
= mac_smack_use();
4561 use_apparmor
= mac_apparmor_use();
4565 if (needs_sandboxing
) {
4568 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4569 * is set here. (See below.) */
4571 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
4573 *exit_status
= EXIT_LIMITS
;
4574 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
4578 if (needs_setuid
&& context
->pam_name
&& username
) {
4579 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4580 * wins here. (See above.) */
4582 /* All fds passed in the fds array will be closed in the pam child process. */
4583 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
4585 *exit_status
= EXIT_PAM
;
4586 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
4589 if (ambient_capabilities_supported()) {
4590 uint64_t ambient_after_pam
;
4592 /* PAM modules might have set some ambient caps. Query them here and merge them into
4593 * the caps we want to set in the end, so that we don't end up unsetting them. */
4594 r
= capability_get_ambient(&ambient_after_pam
);
4596 *exit_status
= EXIT_CAPABILITIES
;
4597 return log_unit_error_errno(unit
, r
, "Failed to query ambient caps: %m");
4600 capability_ambient_set
|= ambient_after_pam
;
4603 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
4604 if (ngids_after_pam
< 0) {
4605 *exit_status
= EXIT_MEMORY
;
4606 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
4610 if (needs_sandboxing
&& exec_context_need_unprivileged_private_users(context
, params
)) {
4611 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4612 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4613 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4615 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4616 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4617 * the actual requested operations fail (or silently continue). */
4618 if (r
< 0 && context
->private_users
) {
4619 *exit_status
= EXIT_USER
;
4620 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
4623 log_unit_info_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4625 userns_set_up
= true;
4628 if (exec_needs_network_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4630 /* Try to enable network namespacing if network namespacing is available and we have
4631 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4632 * new network namespace. And if we don't have that, then we could only create a network
4633 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4634 if (ns_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
4635 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
4636 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4637 log_unit_notice_errno(unit
, r
,
4638 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4640 *exit_status
= EXIT_NETWORK
;
4641 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
4643 } else if (context
->network_namespace_path
) {
4644 *exit_status
= EXIT_NETWORK
;
4645 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4646 "NetworkNamespacePath= is not supported, refusing.");
4648 log_unit_notice(unit
, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4651 if (exec_needs_ipc_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4653 if (ns_type_supported(NAMESPACE_IPC
)) {
4654 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
4656 log_unit_warning_errno(unit
, r
,
4657 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4659 *exit_status
= EXIT_NAMESPACE
;
4660 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
4662 } else if (context
->ipc_namespace_path
) {
4663 *exit_status
= EXIT_NAMESPACE
;
4664 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4665 "IPCNamespacePath= is not supported, refusing.");
4667 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4670 if (needs_mount_namespace
) {
4671 _cleanup_free_
char *error_path
= NULL
;
4673 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, memory_pressure_path
, &error_path
);
4675 *exit_status
= EXIT_NAMESPACE
;
4676 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
4677 error_path
? ": " : "", strempty(error_path
));
4681 if (needs_sandboxing
) {
4682 r
= apply_protect_hostname(unit
, context
, exit_status
);
4687 if (context
->memory_ksm
>= 0)
4688 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
) < 0) {
4689 if (ERRNO_IS_NOT_SUPPORTED(errno
))
4690 log_unit_debug_errno(unit
, errno
, "KSM support not available, ignoring.");
4692 *exit_status
= EXIT_KSM
;
4693 return log_unit_error_errno(unit
, errno
, "Failed to set KSM: %m");
4697 /* Drop groups as early as possible.
4698 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4699 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4701 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
4702 int ngids_to_enforce
= 0;
4704 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
4709 if (ngids_to_enforce
< 0) {
4710 *exit_status
= EXIT_MEMORY
;
4711 return log_unit_error_errno(unit
,
4713 "Failed to merge group lists. Group membership might be incorrect: %m");
4716 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
4718 *exit_status
= EXIT_GROUP
;
4719 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
4723 /* If the user namespace was not set up above, try to do it now.
4724 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4725 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4726 * case of mount namespaces being less privileged when the mount point list is copied from a
4727 * different user namespace). */
4729 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
4730 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4732 *exit_status
= EXIT_USER
;
4733 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
4737 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4740 _cleanup_free_
char *executable
= NULL
;
4741 _cleanup_close_
int executable_fd
= -EBADF
;
4742 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
4744 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
4745 log_unit_struct_errno(unit
, LOG_INFO
, r
,
4746 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4747 LOG_UNIT_INVOCATION_ID(unit
),
4748 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
4750 "EXECUTABLE=%s", command
->path
);
4751 *exit_status
= EXIT_SUCCESS
;
4755 *exit_status
= EXIT_EXEC
;
4756 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
4757 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4758 LOG_UNIT_INVOCATION_ID(unit
),
4759 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
4761 "EXECUTABLE=%s", command
->path
);
4764 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
4766 *exit_status
= EXIT_FDS
;
4767 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4771 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
4776 else if (params
->n_socket_fds
== 1)
4777 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4778 * use context from that fd to compute the label. */
4779 fd
= params
->fds
[0];
4782 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
4784 if (!context
->selinux_context_ignore
) {
4785 *exit_status
= EXIT_SELINUX_CONTEXT
;
4786 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
4788 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
4794 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4795 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4796 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4799 r
= close_all_fds(keep_fds
, n_keep_fds
);
4801 r
= shift_fds(fds
, n_fds
);
4803 r
= flags_fds(fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
4805 *exit_status
= EXIT_FDS
;
4806 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
4809 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4810 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4811 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4814 secure_bits
= context
->secure_bits
;
4816 if (needs_sandboxing
) {
4819 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4820 * (Note this is placed after the general resource limit initialization, see above, in order
4821 * to take precedence.) */
4822 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
4823 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
4824 *exit_status
= EXIT_LIMITS
;
4825 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4830 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4831 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4833 r
= setup_smack(unit
->manager
, context
, executable_fd
);
4834 if (r
< 0 && !context
->smack_process_label_ignore
) {
4835 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
4836 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
4841 bset
= context
->capability_bounding_set
;
4842 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4843 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4844 * instead of us doing that */
4845 if (needs_ambient_hack
)
4846 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
4847 (UINT64_C(1) << CAP_SETUID
) |
4848 (UINT64_C(1) << CAP_SETGID
);
4850 if (!cap_test_all(bset
)) {
4851 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
4853 *exit_status
= EXIT_CAPABILITIES
;
4854 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
4858 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4861 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4862 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4863 * the ambient capabilities can be raised as they are present in the permitted and
4864 * inheritable set. However it is possible that someone wants to set ambient capabilities
4865 * without changing the user, so we also set the ambient capabilities here.
4867 * The requested ambient capabilities are raised in the inheritable set if the second
4868 * argument is true. */
4869 if (!needs_ambient_hack
) {
4870 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
4872 *exit_status
= EXIT_CAPABILITIES
;
4873 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
4878 /* chroot to root directory first, before we lose the ability to chroot */
4879 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
4881 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
4884 if (uid_is_valid(uid
)) {
4885 r
= enforce_user(context
, uid
, capability_ambient_set
);
4887 *exit_status
= EXIT_USER
;
4888 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
4891 if (!needs_ambient_hack
&& capability_ambient_set
!= 0) {
4893 /* Raise the ambient capabilities after user change. */
4894 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
4896 *exit_status
= EXIT_CAPABILITIES
;
4897 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
4903 /* Apply working directory here, because the working directory might be on NFS and only the user running
4904 * this service might have the correct privilege to change to the working directory */
4905 r
= apply_working_directory(context
, params
, runtime
, home
, exit_status
);
4907 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
4909 if (needs_sandboxing
) {
4910 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4911 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4912 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4913 * are restricted. */
4917 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
4920 r
= setexeccon(exec_context
);
4922 if (!context
->selinux_context_ignore
) {
4923 *exit_status
= EXIT_SELINUX_CONTEXT
;
4924 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
4926 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
4933 if (use_apparmor
&& context
->apparmor_profile
) {
4934 r
= aa_change_onexec(context
->apparmor_profile
);
4935 if (r
< 0 && !context
->apparmor_profile_ignore
) {
4936 *exit_status
= EXIT_APPARMOR_PROFILE
;
4937 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
4942 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4943 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4944 * requires CAP_SETPCAP. */
4945 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
4946 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4947 * effective set here.
4949 * The effective set is overwritten during execve() with the following values:
4951 * - ambient set (for non-root processes)
4953 * - (inheritable | bounding) set for root processes)
4955 * Hence there is no security impact to raise it in the effective set before execve
4957 r
= capability_gain_cap_setpcap(/* return_caps= */ NULL
);
4959 *exit_status
= EXIT_CAPABILITIES
;
4960 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
4962 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
4963 *exit_status
= EXIT_SECUREBITS
;
4964 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
4968 if (context_has_no_new_privileges(context
))
4969 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
4970 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
4971 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
4975 r
= apply_address_families(unit
, context
);
4977 *exit_status
= EXIT_ADDRESS_FAMILIES
;
4978 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
4981 r
= apply_memory_deny_write_execute(unit
, context
);
4983 *exit_status
= EXIT_SECCOMP
;
4984 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
4987 r
= apply_restrict_realtime(unit
, context
);
4989 *exit_status
= EXIT_SECCOMP
;
4990 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
4993 r
= apply_restrict_suid_sgid(unit
, context
);
4995 *exit_status
= EXIT_SECCOMP
;
4996 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
4999 r
= apply_restrict_namespaces(unit
, context
);
5001 *exit_status
= EXIT_SECCOMP
;
5002 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
5005 r
= apply_protect_sysctl(unit
, context
);
5007 *exit_status
= EXIT_SECCOMP
;
5008 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
5011 r
= apply_protect_kernel_modules(unit
, context
);
5013 *exit_status
= EXIT_SECCOMP
;
5014 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
5017 r
= apply_protect_kernel_logs(unit
, context
);
5019 *exit_status
= EXIT_SECCOMP
;
5020 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
5023 r
= apply_protect_clock(unit
, context
);
5025 *exit_status
= EXIT_SECCOMP
;
5026 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
5029 r
= apply_private_devices(unit
, context
);
5031 *exit_status
= EXIT_SECCOMP
;
5032 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5035 r
= apply_syscall_archs(unit
, context
);
5037 *exit_status
= EXIT_SECCOMP
;
5038 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5041 r
= apply_lock_personality(unit
, context
);
5043 *exit_status
= EXIT_SECCOMP
;
5044 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5047 r
= apply_syscall_log(unit
, context
);
5049 *exit_status
= EXIT_SECCOMP
;
5050 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5053 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5054 * by the filter as little as possible. */
5055 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5057 *exit_status
= EXIT_SECCOMP
;
5058 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5063 r
= apply_restrict_filesystems(unit
, context
);
5065 *exit_status
= EXIT_BPF
;
5066 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5072 if (!strv_isempty(context
->unset_environment
)) {
5075 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5077 *exit_status
= EXIT_MEMORY
;
5081 strv_free_and_replace(accum_env
, ee
);
5084 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5085 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5087 r
= replace_env_argv(command
->argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5089 *exit_status
= EXIT_MEMORY
;
5090 return log_unit_error_errno(unit
, r
, "Failed to replace environment variables: %m");
5092 final_argv
= replaced_argv
;
5094 if (!strv_isempty(unset_variables
)) {
5095 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5096 log_unit_warning(unit
, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju
));
5099 if (!strv_isempty(bad_variables
)) {
5100 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5101 log_unit_warning(unit
, "Invalid environment variable name evaluates to an empty string: %s", strna(jb
));;
5104 final_argv
= command
->argv
;
5106 log_command_line(unit
, "Executing", executable
, final_argv
);
5111 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5112 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5114 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5115 *exit_status
= EXIT_EXEC
;
5116 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5120 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5125 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5126 * that POLLHUP on it no longer means execve() succeeded. */
5128 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5129 *exit_status
= EXIT_EXEC
;
5130 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5134 *exit_status
= EXIT_EXEC
;
5135 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
5138 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5139 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
/* Parent-side entry point for spawning a command of a unit.
 *
 * Steps visible in this listing, in order:
 *   - if stdin, stdout or stderr is connected to a socket, exactly one socket fd must have been
 *     passed in params (otherwise EINVAL is logged and returned); that fd becomes socket_fd;
 *   - named stdio fds are resolved via exec_context_named_iofds();
 *   - environment files are loaded via exec_context_load_environment();
 *   - the SELinux label database is refreshed and the command line is logged;
 *   - if params->cgroup_path is set, the subcgroup path is acquired and cg_create() is called;
 *   - exec_child() is invoked and a failure is logged as a structured "spawn failed" message;
 *   - the new pid is logged, attached to the cgroup and recorded via exec_status_start().
 *
 * NOTE(review): the embedded line numbers skip several lines (e.g. 5147-5148, 5175-5177, 5212,
 * 5216-5218, 5220-5230, 5247, 5260-5261), so the last parameter, the fork itself and the return
 * value are not visible here — confirm against the complete file before relying on this summary. */
5141 int exec_spawn(Unit
*unit
,
5142 ExecCommand
*command
,
5143 const ExecContext
*context
,
5144 const ExecParameters
*params
,
5145 ExecRuntime
*runtime
,
5146 const CGroupContext
*cgroup_context
,
5149 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
5150 _cleanup_free_
char *subcgroup_path
= NULL
;
5151 _cleanup_strv_free_
char **files_env
= NULL
;
5152 size_t n_storage_fds
= 0, n_socket_fds
= 0;
5160 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
5162 LOG_CONTEXT_PUSH_UNIT(unit
);
5164 if (context
->std_input
== EXEC_INPUT_SOCKET
||
5165 context
->std_output
== EXEC_OUTPUT_SOCKET
||
5166 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
5168 if (params
->n_socket_fds
> 1)
5169 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
5171 if (params
->n_socket_fds
== 0)
5172 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
5174 socket_fd
= params
->fds
[0];
5178 n_socket_fds
= params
->n_socket_fds
;
5179 n_storage_fds
= params
->n_storage_fds
;
5182 r
= exec_context_named_iofds(context
, params
, named_iofds
);
5184 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
5186 r
= exec_context_load_environment(unit
, context
, &files_env
);
5188 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
5190 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5191 and, until the next SELinux policy changes, we save further reloads in future children. */
5192 mac_selinux_maybe_reload();
5194 /* We won't know the real executable path until we create the mount namespace in the child, but we
5195 want to log from the parent, so we use the possibly inaccurate path here. */
5196 log_command_line(unit
, "About to execute", command
->path
, command
->argv
);
5198 if (params
->cgroup_path
) {
5199 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &subcgroup_path
);
5201 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
5203 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5204 * realized by the unit logic) */
5206 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
5208 return log_unit_error_errno(unit
, r
, "Failed to create subcgroup '%s': %m", subcgroup_path
);
5214 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
5219 r
= exec_child(unit
,
5231 unit
->manager
->user_lookup_fds
[1],
5235 const char *status
= ASSERT_PTR(
5236 exit_status_to_string(exit_status
, EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
));
5238 log_unit_struct_errno(unit
, LOG_ERR
, r
,
5239 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5240 LOG_UNIT_INVOCATION_ID(unit
),
5241 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
5242 status
, command
->path
),
5243 "EXECUTABLE=%s", command
->path
);
5245 assert(exit_status
== EXIT_SUCCESS
);
5250 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
5252 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5253 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5254 * process will be killed too). */
5256 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
5258 exec_status_start(&command
->exec_status
, pid
);
/* Initializes an ExecContext to its defaults: *c is overwritten with a compound literal that sets
 * the fields listed below (members not named in the literal are zero-initialized by the language),
 * the per-type directory entries are visited with FOREACH_ARRAY(), and the NUMA policy is reset.
 * The trailing assert_cc() is a compile-time check that NAMESPACE_FLAGS_INITIAL differs from
 * NAMESPACE_FLAGS_ALL.
 *
 * NOTE(review): the embedded line numbers skip 5268, 5280, 5282, 5286 and 5291, so further
 * initializer entries and the body of the FOREACH_ARRAY() loop are not visible in this listing —
 * confirm against the complete file. */
5264 void exec_context_init(ExecContext
*c
) {
5267 *c
= (ExecContext
) {
5269 .ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
,
5270 .cpu_sched_policy
= SCHED_OTHER
,
5271 .syslog_priority
= LOG_DAEMON
|LOG_INFO
,
5272 .syslog_level_prefix
= true,
5273 .ignore_sigpipe
= true,
5274 .timer_slack_nsec
= NSEC_INFINITY
,
5275 .personality
= PERSONALITY_INVALID
,
5276 .timeout_clean_usec
= USEC_INFINITY
,
5277 .capability_bounding_set
= CAP_MASK_UNSET
,
5278 .restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
,
5279 .log_level_max
= -1,
5281 .syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
,
5283 .tty_rows
= UINT_MAX
,
5284 .tty_cols
= UINT_MAX
,
5285 .private_mounts
= -1,
5287 .set_login_environment
= -1,
5290 FOREACH_ARRAY(d
, c
->directories
, _EXEC_DIRECTORY_TYPE_MAX
)
5293 numa_policy_reset(&c
->numa_policy
);
5295 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
5298 void exec_context_done(ExecContext
*c
) {
5301 c
->environment
= strv_free(c
->environment
);
5302 c
->environment_files
= strv_free(c
->environment_files
);
5303 c
->pass_environment
= strv_free(c
->pass_environment
);
5304 c
->unset_environment
= strv_free(c
->unset_environment
);
5306 rlimit_free_all(c
->rlimit
);
5308 for (size_t l
= 0; l
< 3; l
++) {
5309 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
5310 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
5313 c
->working_directory
= mfree(c
->working_directory
);
5314 c
->root_directory
= mfree(c
->root_directory
);
5315 c
->root_image
= mfree(c
->root_image
);
5316 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
5317 c
->root_hash
= mfree(c
->root_hash
);
5318 c
->root_hash_size
= 0;
5319 c
->root_hash_path
= mfree(c
->root_hash_path
);
5320 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
5321 c
->root_hash_sig_size
= 0;
5322 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
5323 c
->root_verity
= mfree(c
->root_verity
);
5324 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
5325 c
->extension_directories
= strv_free(c
->extension_directories
);
5326 c
->tty_path
= mfree(c
->tty_path
);
5327 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
5328 c
->user
= mfree(c
->user
);
5329 c
->group
= mfree(c
->group
);
5331 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
5333 c
->pam_name
= mfree(c
->pam_name
);
5335 c
->read_only_paths
= strv_free(c
->read_only_paths
);
5336 c
->read_write_paths
= strv_free(c
->read_write_paths
);
5337 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
5338 c
->exec_paths
= strv_free(c
->exec_paths
);
5339 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
5340 c
->exec_search_path
= strv_free(c
->exec_search_path
);
5342 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
5343 c
->bind_mounts
= NULL
;
5344 c
->n_bind_mounts
= 0;
5345 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
5346 c
->temporary_filesystems
= NULL
;
5347 c
->n_temporary_filesystems
= 0;
5348 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
5350 cpu_set_reset(&c
->cpu_set
);
5351 numa_policy_reset(&c
->numa_policy
);
5353 c
->utmp_id
= mfree(c
->utmp_id
);
5354 c
->selinux_context
= mfree(c
->selinux_context
);
5355 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
5356 c
->smack_process_label
= mfree(c
->smack_process_label
);
5358 c
->restrict_filesystems
= set_free_free(c
->restrict_filesystems
);
5360 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
5361 c
->syscall_archs
= set_free(c
->syscall_archs
);
5362 c
->address_families
= set_free(c
->address_families
);
5364 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5365 exec_directory_done(&c
->directories
[t
]);
5367 c
->log_level_max
= -1;
5369 exec_context_free_log_extra_fields(c
);
5370 c
->log_filter_allowed_patterns
= set_free_free(c
->log_filter_allowed_patterns
);
5371 c
->log_filter_denied_patterns
= set_free_free(c
->log_filter_denied_patterns
);
5373 c
->log_ratelimit_interval_usec
= 0;
5374 c
->log_ratelimit_burst
= 0;
5376 c
->stdin_data
= mfree(c
->stdin_data
);
5377 c
->stdin_data_size
= 0;
5379 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
5380 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
5382 c
->log_namespace
= mfree(c
->log_namespace
);
5384 c
->load_credentials
= hashmap_free(c
->load_credentials
);
5385 c
->set_credentials
= hashmap_free(c
->set_credentials
);
5386 c
->import_credentials
= set_free_free(c
->import_credentials
);
5388 c
->root_image_policy
= image_policy_free(c
->root_image_policy
);
5389 c
->mount_image_policy
= image_policy_free(c
->mount_image_policy
);
5390 c
->extension_image_policy
= image_policy_free(c
->extension_image_policy
);
5393 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
5396 if (!runtime_prefix
)
5399 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
5400 _cleanup_free_
char *p
= NULL
;
5402 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5403 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5405 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5409 /* We execute this synchronously, since we need to be sure this is gone when we start the
5411 (void) rm_rf(p
, REMOVE_ROOT
);
5413 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
5414 _cleanup_free_
char *symlink_abs
= NULL
;
5416 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5417 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
5419 symlink_abs
= path_join(runtime_prefix
, *symlink
);
5423 (void) unlink(symlink_abs
);
5430 int exec_context_destroy_mount_ns_dir(Unit
*u
) {
5431 _cleanup_free_
char *p
= NULL
;
5433 if (!u
|| !MANAGER_IS_SYSTEM(u
->manager
))
5436 p
= path_join("/run/systemd/propagate/", u
->id
);
5440 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5441 if (rmdir(p
) < 0 && errno
!= ENOENT
)
5442 log_unit_debug_errno(u
, errno
, "Unable to remove propagation dir '%s', ignoring: %m", p
);
5447 static void exec_command_done(ExecCommand
*c
) {
5450 c
->path
= mfree(c
->path
);
5451 c
->argv
= strv_free(c
->argv
);
5454 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
5455 for (size_t i
= 0; i
< n
; i
++)
5456 exec_command_done(c
+i
);
5459 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
5462 while ((i
= LIST_POP(command
, c
))) {
5463 exec_command_done(i
);
5470 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
5471 for (size_t i
= 0; i
< n
; i
++)
5472 c
[i
] = exec_command_free_list(c
[i
]);
5475 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
5476 for (size_t i
= 0; i
< n
; i
++)
5477 exec_status_reset(&c
[i
].exec_status
);
5480 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
5481 for (size_t i
= 0; i
< n
; i
++)
5482 LIST_FOREACH(command
, z
, c
[i
])
5483 exec_status_reset(&z
->exec_status
);
5486 typedef struct InvalidEnvInfo
{
5491 static void invalid_env(const char *p
, void *userdata
) {
5492 InvalidEnvInfo
*info
= userdata
;
5494 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
5497 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
5503 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
5506 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
5509 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
5512 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
5515 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
5518 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
5525 static int exec_context_named_iofds(
5526 const ExecContext
*c
,
5527 const ExecParameters
*p
,
5528 int named_iofds
[static 3]) {
5531 const char* stdio_fdname
[3];
5536 assert(named_iofds
);
5538 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
5539 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
5540 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
5542 for (size_t i
= 0; i
< 3; i
++)
5543 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
5545 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
5547 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
5548 if (named_iofds
[STDIN_FILENO
] < 0 &&
5549 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
5550 stdio_fdname
[STDIN_FILENO
] &&
5551 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
5553 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
5556 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
5557 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
5558 stdio_fdname
[STDOUT_FILENO
] &&
5559 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
5561 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
5564 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
5565 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
5566 stdio_fdname
[STDERR_FILENO
] &&
5567 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
5569 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
5573 return targets
== 0 ? 0 : -ENOENT
;
/* Builds one environment block from all files listed in c->environment_files.
 *
 * Visible steps: each entry is required to be an absolute path, is expanded with safe_glob(), and
 * every match is read with load_env_file(). Invalid assignments are dropped by
 * strv_env_clean_with_callback(), which reports them through invalid_env() together with the file
 * name, and the cleaned result is merged into the accumulated list v with strv_env_merge().
 *
 * NOTE(review): the embedded line numbers skip many lines (e.g. 5586-5592, 5594-5597, 5601-5605,
 * 5614-5618, 5630-5632, 5642-5644), so how fn is derived, how the "ignore" flag is set and used,
 * the error paths and the hand-over of the result through ret are not visible in this listing —
 * confirm against the complete file. */
5576 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
5577 _cleanup_strv_free_
char **v
= NULL
;
5583 STRV_FOREACH(i
, c
->environment_files
) {
5584 _cleanup_globfree_ glob_t pglob
= {};
5585 bool ignore
= false;
5593 if (!path_is_absolute(fn
)) {
5599 /* Filename supports globbing, take all matching files */
5600 r
= safe_glob(fn
, 0, &pglob
);
5607 /* When we don't match anything, -ENOENT should be returned */
5608 assert(pglob
.gl_pathc
> 0);
5610 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
5611 _cleanup_strv_free_
char **p
= NULL
;
5613 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
5620 /* Log invalid environment variables with filename */
5622 InvalidEnvInfo info
= {
5624 .path
= pglob
.gl_pathv
[n
]
5627 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
5633 char **m
= strv_env_merge(v
, p
);
5637 strv_free_and_replace(v
, m
);
5647 static bool tty_may_match_dev_console(const char *tty
) {
5648 _cleanup_free_
char *resolved
= NULL
;
5653 tty
= skip_dev_prefix(tty
);
5655 /* trivial identity? */
5656 if (streq(tty
, "console"))
5659 if (resolve_dev_console(&resolved
) < 0)
5660 return true; /* if we could not resolve, assume it may */
5662 /* "tty0" means the active VC, so it may be the same sometimes */
5663 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Reports whether executing with this context may access a TTY: true if any of the TTY handling
 * options tested below is enabled, or if stdin, stdout or stderr is connected to a terminal
 * according to is_terminal_input()/is_terminal_output(). */
5666 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
5669 return ec
->tty_reset
||
/* NOTE(review): the embedded numbering jumps from 5669 to 5671, so one term of this || chain may be
 * missing from the listing — confirm against the complete file. */
5671 ec
->tty_vt_disallocate
||
5672 is_terminal_input(ec
->std_input
) ||
5673 is_terminal_output(ec
->std_output
) ||
5674 is_terminal_output(ec
->std_error
);
5677 bool exec_context_may_touch_console(const ExecContext
*ec
) {
5679 return exec_context_may_touch_tty(ec
) &&
5680 tty_may_match_dev_console(exec_context_tty_path(ec
));
5683 static void strv_fprintf(FILE *f
, char **l
) {
5687 fprintf(f
, " %s", *g
);
/* Dumps a string vector as one "<prefix><name>: item item ..." line to f. Nothing at all is written
 * when strv is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (!strv_isempty(strv)) {
                fprintf(f, "%s%s:", prefix, name);
                strv_fprintf(f, strv);
                fputs("\n", f);
        }
}
5702 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
5708 prefix
= strempty(prefix
);
5712 "%sWorkingDirectory: %s\n"
5713 "%sRootDirectory: %s\n"
5714 "%sRootEphemeral: %s\n"
5715 "%sNonBlocking: %s\n"
5716 "%sPrivateTmp: %s\n"
5717 "%sPrivateDevices: %s\n"
5718 "%sProtectKernelTunables: %s\n"
5719 "%sProtectKernelModules: %s\n"
5720 "%sProtectKernelLogs: %s\n"
5721 "%sProtectClock: %s\n"
5722 "%sProtectControlGroups: %s\n"
5723 "%sPrivateNetwork: %s\n"
5724 "%sPrivateUsers: %s\n"
5725 "%sProtectHome: %s\n"
5726 "%sProtectSystem: %s\n"
5727 "%sMountAPIVFS: %s\n"
5728 "%sIgnoreSIGPIPE: %s\n"
5729 "%sMemoryDenyWriteExecute: %s\n"
5730 "%sRestrictRealtime: %s\n"
5731 "%sRestrictSUIDSGID: %s\n"
5732 "%sKeyringMode: %s\n"
5733 "%sProtectHostname: %s\n"
5734 "%sProtectProc: %s\n"
5735 "%sProcSubset: %s\n",
5737 prefix
, empty_to_root(c
->working_directory
),
5738 prefix
, empty_to_root(c
->root_directory
),
5739 prefix
, yes_no(c
->root_ephemeral
),
5740 prefix
, yes_no(c
->non_blocking
),
5741 prefix
, yes_no(c
->private_tmp
),
5742 prefix
, yes_no(c
->private_devices
),
5743 prefix
, yes_no(c
->protect_kernel_tunables
),
5744 prefix
, yes_no(c
->protect_kernel_modules
),
5745 prefix
, yes_no(c
->protect_kernel_logs
),
5746 prefix
, yes_no(c
->protect_clock
),
5747 prefix
, yes_no(c
->protect_control_groups
),
5748 prefix
, yes_no(c
->private_network
),
5749 prefix
, yes_no(c
->private_users
),
5750 prefix
, protect_home_to_string(c
->protect_home
),
5751 prefix
, protect_system_to_string(c
->protect_system
),
5752 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
5753 prefix
, yes_no(c
->ignore_sigpipe
),
5754 prefix
, yes_no(c
->memory_deny_write_execute
),
5755 prefix
, yes_no(c
->restrict_realtime
),
5756 prefix
, yes_no(c
->restrict_suid_sgid
),
5757 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
5758 prefix
, yes_no(c
->protect_hostname
),
5759 prefix
, protect_proc_to_string(c
->protect_proc
),
5760 prefix
, proc_subset_to_string(c
->proc_subset
));
5763 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
5765 if (c
->root_image_options
) {
5766 fprintf(f
, "%sRootImageOptions:", prefix
);
5767 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
5768 if (!isempty(o
->options
))
5769 fprintf(f
, " %s:%s",
5770 partition_designator_to_string(o
->partition_designator
),
5776 _cleanup_free_
char *encoded
= NULL
;
5777 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
5779 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
5782 if (c
->root_hash_path
)
5783 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
5785 if (c
->root_hash_sig
) {
5786 _cleanup_free_
char *encoded
= NULL
;
5788 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
5790 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
5793 if (c
->root_hash_sig_path
)
5794 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
5797 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
5799 STRV_FOREACH(e
, c
->environment
)
5800 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
5802 STRV_FOREACH(e
, c
->environment_files
)
5803 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
5805 STRV_FOREACH(e
, c
->pass_environment
)
5806 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
5808 STRV_FOREACH(e
, c
->unset_environment
)
5809 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
5811 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
5813 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5814 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
5816 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
5817 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
5819 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
5820 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
5824 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
5827 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
5829 if (c
->oom_score_adjust_set
)
5830 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
5832 if (c
->coredump_filter_set
)
5833 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
5835 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
5837 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
5838 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
5839 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
5840 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
5843 if (c
->ioprio_set
) {
5844 _cleanup_free_
char *class_str
= NULL
;
5846 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
5848 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
5850 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
5853 if (c
->cpu_sched_set
) {
5854 _cleanup_free_
char *policy_str
= NULL
;
5856 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
5858 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
5861 "%sCPUSchedulingPriority: %i\n"
5862 "%sCPUSchedulingResetOnFork: %s\n",
5863 prefix
, c
->cpu_sched_priority
,
5864 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
5867 if (c
->cpu_set
.set
) {
5868 _cleanup_free_
char *affinity
= NULL
;
5870 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
5871 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
5874 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
5875 _cleanup_free_
char *nodes
= NULL
;
5877 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
5878 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
5879 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
5882 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
5883 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
5886 "%sStandardInput: %s\n"
5887 "%sStandardOutput: %s\n"
5888 "%sStandardError: %s\n",
5889 prefix
, exec_input_to_string(c
->std_input
),
5890 prefix
, exec_output_to_string(c
->std_output
),
5891 prefix
, exec_output_to_string(c
->std_error
));
5893 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
5894 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
5895 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
5896 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
5897 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
5898 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
5900 if (c
->std_input
== EXEC_INPUT_FILE
)
5901 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
5902 if (c
->std_output
== EXEC_OUTPUT_FILE
)
5903 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5904 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
5905 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5906 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
5907 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5908 if (c
->std_error
== EXEC_OUTPUT_FILE
)
5909 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5910 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
5911 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5912 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
5913 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5919 "%sTTYVHangup: %s\n"
5920 "%sTTYVTDisallocate: %s\n"
5922 "%sTTYColumns: %u\n",
5923 prefix
, c
->tty_path
,
5924 prefix
, yes_no(c
->tty_reset
),
5925 prefix
, yes_no(c
->tty_vhangup
),
5926 prefix
, yes_no(c
->tty_vt_disallocate
),
5927 prefix
, c
->tty_rows
,
5928 prefix
, c
->tty_cols
);
5930 if (IN_SET(c
->std_output
,
5932 EXEC_OUTPUT_JOURNAL
,
5933 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5934 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
5935 IN_SET(c
->std_error
,
5937 EXEC_OUTPUT_JOURNAL
,
5938 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5939 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
5941 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
5943 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
5945 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
5947 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
5949 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
5952 if (c
->log_level_max
>= 0) {
5953 _cleanup_free_
char *t
= NULL
;
5955 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
5957 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
5960 if (c
->log_ratelimit_interval_usec
> 0)
5962 "%sLogRateLimitIntervalSec: %s\n",
5963 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
5965 if (c
->log_ratelimit_burst
> 0)
5966 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
5968 if (!set_isempty(c
->log_filter_allowed_patterns
) || !set_isempty(c
->log_filter_denied_patterns
)) {
5969 fprintf(f
, "%sLogFilterPatterns:", prefix
);
5972 SET_FOREACH(pattern
, c
->log_filter_allowed_patterns
)
5973 fprintf(f
, " %s", pattern
);
5974 SET_FOREACH(pattern
, c
->log_filter_denied_patterns
)
5975 fprintf(f
, " ~%s", pattern
);
5979 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
5980 fprintf(f
, "%sLogExtraFields: ", prefix
);
5981 fwrite(c
->log_extra_fields
[j
].iov_base
,
5982 1, c
->log_extra_fields
[j
].iov_len
,
5987 if (c
->log_namespace
)
5988 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
5990 if (c
->secure_bits
) {
5991 _cleanup_free_
char *str
= NULL
;
5993 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
5995 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
5998 if (c
->capability_bounding_set
!= CAP_MASK_UNSET
) {
5999 _cleanup_free_
char *str
= NULL
;
6001 r
= capability_set_to_string(c
->capability_bounding_set
, &str
);
6003 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
6006 if (c
->capability_ambient_set
!= 0) {
6007 _cleanup_free_
char *str
= NULL
;
6009 r
= capability_set_to_string(c
->capability_ambient_set
, &str
);
6011 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
6015 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
6017 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
6019 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
6021 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
6024 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
6026 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
6027 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
6028 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
6029 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
6030 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
6031 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
6033 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
6034 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
6035 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6036 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6037 c
->bind_mounts
[i
].source
,
6038 c
->bind_mounts
[i
].destination
,
6039 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6041 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6042 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6044 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6046 isempty(t
->options
) ? "" : ":",
6047 strempty(t
->options
));
6052 "%sUtmpIdentifier: %s\n",
6053 prefix
, c
->utmp_id
);
6055 if (c
->selinux_context
)
6057 "%sSELinuxContext: %s%s\n",
6058 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6060 if (c
->apparmor_profile
)
6062 "%sAppArmorProfile: %s%s\n",
6063 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6065 if (c
->smack_process_label
)
6067 "%sSmackProcessLabel: %s%s\n",
6068 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6070 if (c
->personality
!= PERSONALITY_INVALID
)
6072 "%sPersonality: %s\n",
6073 prefix
, strna(personality_to_string(c
->personality
)));
6076 "%sLockPersonality: %s\n",
6077 prefix
, yes_no(c
->lock_personality
));
6079 if (c
->syscall_filter
) {
6081 "%sSystemCallFilter: ",
6084 if (!c
->syscall_allow_list
)
6090 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6091 _cleanup_free_
char *name
= NULL
;
6092 const char *errno_name
= NULL
;
6093 int num
= PTR_TO_INT(val
);
6100 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6101 fputs(strna(name
), f
);
6104 errno_name
= seccomp_errno_or_action_to_string(num
);
6106 fprintf(f
, ":%s", errno_name
);
6108 fprintf(f
, ":%d", num
);
6116 if (c
->syscall_archs
) {
6118 "%sSystemCallArchitectures:",
6123 SET_FOREACH(id
, c
->syscall_archs
)
6124 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6129 if (exec_context_restrict_namespaces_set(c
)) {
6130 _cleanup_free_
char *s
= NULL
;
6132 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
6134 fprintf(f
, "%sRestrictNamespaces: %s\n",
6139 if (exec_context_restrict_filesystems_set(c
)) {
6141 SET_FOREACH(fs
, c
->restrict_filesystems
)
6142 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
6146 if (c
->network_namespace_path
)
6148 "%sNetworkNamespacePath: %s\n",
6149 prefix
, c
->network_namespace_path
);
6151 if (c
->syscall_errno
> 0) {
6152 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
6155 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
6157 fputs(errno_name
, f
);
6159 fprintf(f
, "%d", c
->syscall_errno
);
6164 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
6165 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
6166 c
->mount_images
[i
].ignore_enoent
? "-": "",
6167 c
->mount_images
[i
].source
,
6168 c
->mount_images
[i
].destination
);
6169 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
6170 fprintf(f
, ":%s:%s",
6171 partition_designator_to_string(o
->partition_designator
),
6172 strempty(o
->options
));
6176 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
6177 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
6178 c
->extension_images
[i
].ignore_enoent
? "-": "",
6179 c
->extension_images
[i
].source
);
6180 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
6181 fprintf(f
, ":%s:%s",
6182 partition_designator_to_string(o
->partition_designator
),
6183 strempty(o
->options
));
6187 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
/* Tells the caller whether a process forked off for context c keeps its privileges. The check
 * shown here treats a configured user name of "root" or "0" as privileged. */
6190 bool exec_context_maintains_privileges(const ExecContext
*c
) {
6193 /* Returns true if the process forked off would run under
6194 * an unchanged UID or as root. */
6199 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Computes the I/O priority that is effective for context c. The code shown reads the calling
 * process's own priority via ioprio_get(IOPRIO_WHO_PROCESS, 0), has one return of
 * IOPRIO_DEFAULT_CLASS_AND_PRIO and otherwise returns the ioprio_normalize()d value.
 * NOTE(review): presumably the default is the fallback for a failing ioprio_get() — confirm. */
6205 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
6213 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
6215 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6217 return ioprio_normalize(p
);
/* Returns the effective "mount API VFS" setting of context c: the configured mount_apivfs value
 * when mount_apivfs_set says it was set explicitly, otherwise a default that depends on
 * exec_context_with_rootfs(c). */
6220 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
6223 /* Explicit setting wins */
6224 if (c
->mount_apivfs_set
)
6225 return c
->mount_apivfs
;
6227 /* Default to "yes" if root directory or image are specified */
6228 if (exec_context_with_rootfs(c
))
/* Releases the extra log fields of context c: frees the iov_base buffer of each of the
 * n_log_extra_fields entries, releases the array itself through mfree() (storing its result back
 * into the pointer) and resets the element count to 0. */
6234 void exec_context_free_log_extra_fields(ExecContext
*c
) {
6237 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
6238 free(c
->log_extra_fields
[l
].iov_base
);
6239 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
6240 c
->n_log_extra_fields
= 0;
/* Resets the TTY configured for context c and then restores the ownership/access mode of its
 * device node. The function returns void: every failure is only logged ("..., ignoring") and
 * never reported to the caller. */
6243 void exec_context_revert_tty(ExecContext
*c
) {
6244 _cleanup_close_
int fd
= -EBADF
;
6251 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6252 exec_context_tty_reset(c
, NULL
);
6254 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6255 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6256 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6257 if (!exec_context_may_touch_tty(c
))
6260 path
= exec_context_tty_path(c
);
/* Open with O_PATH so that we only get a reference to the inode; the comment further down
 * explains why actually opening the TTY is avoided. */
6264 fd
= open(path
, O_PATH
|O_CLOEXEC
);
/* A missing device node (ENOENT) is logged at debug level only, anything else as a warning. */
6266 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
6267 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6270 if (fstat(fd
, &st
) < 0)
6271 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
6273 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6274 * if things are a character device, since a proper check either means we'd have to open the TTY and
6275 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6276 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6277 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6278 if (!S_ISCHR(st
.st_mode
))
/* NOTE(review): unlike the two early returns above, this one has no (void) cast even though the
 * function returns void — consider making the three consistent. */
6279 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
6281 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
6283 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
/* Collects, in a string vector, the paths that belong to the directory types selected in mask.
 * For each configured item of a selected type t the item path is joined onto prefix[t]; for every
 * type except EXEC_DIRECTORY_CONFIGURATION the corresponding path below "private" is added as
 * well, and so is each of the item's symlinks (also relative to prefix[t]).
 * NOTE(review): how the finished list reaches the caller is not visible here — confirm. */
6286 int exec_context_get_clean_directories(
6292 _cleanup_strv_free_
char **l
= NULL
;
6299 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
/* Directory types whose bit is not set in the mask are not of interest. */
6300 if (!FLAGS_SET(mask
, 1U << t
))
6306 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
6309 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
6313 r
= strv_consume(&l
, j
);
6317 /* Also remove private directories unconditionally. */
6318 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
6319 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
6323 r
= strv_consume(&l
, j
);
6328 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
6329 j
= path_join(prefix
[t
], *symlink
);
6333 r
= strv_consume(&l
, j
);
/* Determines which directory types of context c have at least one configured item (n_items > 0),
 * starting from an empty mask.
 * NOTE(review): presumably the bit 1U << t is set for each such type and the mask is handed back
 * through *ret — confirm. */
6344 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
6345 ExecCleanMask mask
= 0;
6350 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6351 if (c
->directories
[t
].n_items
> 0)
/* Records in s that process pid has been started; the start timestamp is taken with
 * dual_timestamp_get(). */
6358 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
6365 dual_timestamp_get(&s
->start_timestamp
);
/* Records in s that process pid has exited with the given code/status and takes the exit
 * timestamp. If a context with a utmp_id is supplied (context may be NULL), a dead-process utmp
 * record is written as well; a failure to do so is deliberately ignored. */
6368 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
6376 dual_timestamp_get(&s
->exit_timestamp
);
6381 if (context
&& context
->utmp_id
)
6382 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
/* Resets *s by overwriting it with an empty (all-zero) ExecStatus. */
6385 void exec_status_reset(ExecStatus
*s
) {
6388 *s
= (ExecStatus
) {};
/* Writes a human-readable description of s to f. Each line starts with prefix; a NULL prefix is
 * replaced by the empty string. The start and exit timestamps (realtime clock) are only printed
 * when they are set; the exit code is rendered with sigchld_code_to_string(). */
6391 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
6398 prefix
= strempty(prefix
);
6401 "%sPID: "PID_FMT
"\n",
6404 if (dual_timestamp_is_set(&s
->start_timestamp
))
6406 "%sStart Timestamp: %s\n",
6407 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
6409 if (dual_timestamp_is_set(&s
->exit_timestamp
))
6411 "%sExit Timestamp: %s\n"
6413 "%sExit Status: %i\n",
6414 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
6415 prefix
, sigchld_code_to_string(s
->code
),
/* Writes a description of a single command to f: first its command line, quoted with
 * quote_command_line(), then its exec status. The status lines use prefix2, i.e. prefix plus one
 * additional tab, so that they appear nested below the command line. */
6419 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6420 _cleanup_free_
char *cmd
= NULL
;
6421 const char *prefix2
;
6426 prefix
= strempty(prefix
);
6427 prefix2
= strjoina(prefix
, "\t");
6429 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
/* cmd is passed through strnull() so that a NULL result from the quoting above is still safe to
 * print with %s. */
6432 "%sCommand Line: %s\n",
6433 prefix
, strnull(cmd
));
6435 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every command of the linked list that starts at c to f, one exec_command_dump() call per
 * entry. A NULL prefix is treated as the empty string. */
6438 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6441 prefix
= strempty(prefix
);
6443 LIST_FOREACH(command
, i
, c
)
6444 exec_command_dump(i
, f
, prefix
);
/* Appends command e to the list *l. The current tail is looked up first and e is inserted after
 * it, so the commands keep the order in which they were added. */
6447 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
6454 /* It's kind of important, that we keep the order here */
6455 end
= LIST_FIND_TAIL(command
, *l
);
6456 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces both the path and the argument vector of command c. The new vector l is built by
 * strv_new_ap() from path and the variadic arguments that follow it; the old c->path and c->argv
 * are released by the free_and_replace()/strv_free_and_replace() helpers.
 * NOTE(review): presumably the variadic list must be NULL-terminated and p is a copy of path —
 * confirm. */
6461 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
6469 l
= strv_new_ap(path
, ap
);
6481 free_and_replace(c
->path
, p
);
6483 return strv_free_and_replace(c
->argv
, l
);
/* Appends further arguments to the argument vector of command c. The strings are collected from
 * path and the variadic arguments that follow it into the temporary vector l, which is then added
 * to c->argv with strv_extend_strv(); l itself is released automatically on return.
 * NOTE(review): presumably the variadic list must be NULL-terminated — confirm. */
6486 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
6487 _cleanup_strv_free_
char **l
= NULL
;
6495 l
= strv_new_ap(path
, ap
);
6501 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Removes the directory tree at path from disk, unless path is the RUN_SYSTEMD_EMPTY placeholder.
 * The removal is delegated to asynchronous_rm_rf() and its result is deliberately ignored.
 * NOTE(review): callers assign the return value back to the pointer they passed in, so this
 * presumably frees path and returns NULL — confirm. */
6508 static char *destroy_tree(char *path
) {
6512 if (!path_equal(path
, RUN_SYSTEMD_EMPTY
)) {
6513 log_debug("Spawning process to nuke '%s'", path
);
6515 (void) asynchronous_rm_rf(path
, REMOVE_ROOT
|REMOVE_SUBVOLUME
|REMOVE_PHYSICAL
);
/* Releases an ExecSharedRuntime object: removes it from the manager's exec_shared_runtime_by_id
 * hashmap, frees the id and the two tmp directory strings and closes both namespace storage
 * socket pairs. The directories on disk are not touched here; that is what
 * exec_shared_runtime_destroy() does. */
6521 static ExecSharedRuntime
* exec_shared_runtime_free(ExecSharedRuntime
*rt
) {
6526 (void) hashmap_remove(rt
->manager
->exec_shared_runtime_by_id
, rt
->id
);
6528 rt
->id
= mfree(rt
->id
);
6529 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
6530 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
6531 safe_close_pair(rt
->netns_storage_socket
);
6532 safe_close_pair(rt
->ipcns_storage_socket
);
/* Generate the reference-dropping helper for ExecSharedRuntime and the cleanup handler
 * exec_shared_runtime_freep() that the _cleanup_() declarations in this file rely on; both are
 * built on top of exec_shared_runtime_free(). */
6536 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime
, exec_shared_runtime
, exec_shared_runtime_free
);
6537 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime
*, exec_shared_runtime_free
);
/* Destroys a shared runtime: the private tmp directories are removed from disk via destroy_tree()
 * and the object is then released with exec_shared_runtime_free(), whose result is returned. The
 * object must still be referenced (n_ref > 0) when this is called. */
6539 ExecSharedRuntime
* exec_shared_runtime_destroy(ExecSharedRuntime
*rt
) {
6543 assert(rt
->n_ref
> 0);
6549 rt
->tmp_dir
= destroy_tree(rt
->tmp_dir
);
6550 rt
->var_tmp_dir
= destroy_tree(rt
->var_tmp_dir
);
6552 return exec_shared_runtime_free(rt
);
/* Allocates a new ExecSharedRuntime. The object owns a private copy of id and starts out with
 * both storage socket pairs set to PIPE_EBADF; fields not named in the initializer are zero.
 * NOTE(review): presumably the new object is returned through *ret — confirm. */
6555 static int exec_shared_runtime_allocate(ExecSharedRuntime
**ret
, const char *id
) {
6556 _cleanup_free_
char *id_copy
= NULL
;
6557 ExecSharedRuntime
*n
;
6561 id_copy
= strdup(id
);
6565 n
= new(ExecSharedRuntime
, 1);
6569 *n
= (ExecSharedRuntime
) {
6570 .id
= TAKE_PTR(id_copy
),
6571 .netns_storage_socket
= PIPE_EBADF
,
6572 .ipcns_storage_socket
= PIPE_EBADF
,
/* Creates an ExecSharedRuntime for id, registers it in m->exec_shared_runtime_by_id (keyed by the
 * object's own id string) and moves the resources supplied by the caller into it: the two tmp
 * directory strings and, for each socket array that is not NULL, both file descriptors. The
 * TAKE_PTR()/TAKE_FD() calls transfer ownership away from the caller's variables. */
6579 static int exec_shared_runtime_add(
6584 int netns_storage_socket
[2],
6585 int ipcns_storage_socket
[2],
6586 ExecSharedRuntime
**ret
) {
6588 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt
= NULL
;
6594 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6596 r
= exec_shared_runtime_allocate(&rt
, id
);
6600 r
= hashmap_ensure_put(&m
->exec_shared_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
6604 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
6605 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
6606 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
6608 if (netns_storage_socket
) {
6609 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
6610 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
6613 if (ipcns_storage_socket
) {
6614 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
6615 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
6622 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
/* Sets up the resources a new shared runtime for context c needs and registers them through
 * exec_shared_runtime_add(): private tmp directories when private_tmp is enabled, plus one
 * AF_UNIX datagram socket pair each when the context needs a network or an IPC namespace.
 * If none of the three is required, no object has to be created at all. */
6627 static int exec_shared_runtime_make(
6629 const ExecContext
*c
,
6631 ExecSharedRuntime
**ret
) {
6633 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6634 _cleanup_close_pair_
int netns_storage_socket
[2] = PIPE_EBADF
, ipcns_storage_socket
[2] = PIPE_EBADF
;
6641 /* It is not necessary to create ExecSharedRuntime object. */
6642 if (!exec_needs_network_namespace(c
) && !exec_needs_ipc_namespace(c
) && !c
->private_tmp
) {
/* The tmp directories are skipped when the inaccessible paths already cover both "/tmp" and
 * "/var/tmp" (the latter either directly or through "/var"). */
6647 if (c
->private_tmp
&&
6648 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
6649 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
6650 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
6651 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
6656 if (exec_needs_network_namespace(c
)) {
6657 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
6661 if (exec_needs_ipc_namespace(c
)) {
6662 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
6666 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
/* Obtains the ExecSharedRuntime registered under id in the manager's hashmap. An existing object
 * is reused with its reference count increased; otherwise a new one is created through
 * exec_shared_runtime_make(), which may also report that no object is needed (r == 0).
 * NOTE(review): how the `create` flag gates the creation path is not visible here — confirm. */
6673 int exec_shared_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecSharedRuntime
**ret
) {
6674 ExecSharedRuntime
*rt
;
6681 rt
= hashmap_get(m
->exec_shared_runtime_by_id
, id
);
6683 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6691 /* If not found, then create a new object. */
6692 r
= exec_shared_runtime_make(m
, c
, id
, &rt
);
6696 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6702 /* increment reference counter. */
/* Serializes every ExecSharedRuntime registered with manager m to f. Each object produces one
 * "exec-runtime=<id>" record followed by optional " key=value" fields: the tmp directories and
 * the four storage socket fds. Open sockets (fd >= 0) are duplicated into fds with
 * fdset_put_dup() and it is the number of the duplicate that is written out. */
6708 int exec_shared_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
6709 ExecSharedRuntime
*rt
;
6715 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
6716 fprintf(f
, "exec-runtime=%s", rt
->id
);
6719 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
6721 if (rt
->var_tmp_dir
)
6722 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
6724 if (rt
->netns_storage_socket
[0] >= 0) {
6727 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
6731 fprintf(f
, " netns-socket-0=%i", copy
);
6734 if (rt
->netns_storage_socket
[1] >= 0) {
6737 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
6741 fprintf(f
, " netns-socket-1=%i", copy
);
6744 if (rt
->ipcns_storage_socket
[0] >= 0) {
6747 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
6751 fprintf(f
, " ipcns-socket-0=%i", copy
);
6754 if (rt
->ipcns_storage_socket
[1] >= 0) {
6757 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
6761 fprintf(f
, " ipcns-socket-1=%i", copy
);
/* Applies one legacy key/value pair from a unit's serialization to the ExecSharedRuntime stored
 * under the unit's id, allocating that object first if the manager has none yet. The keys handled
 * here are "tmp-dir", "var-tmp-dir", "netns-socket-0" and "netns-socket-1"; socket values must
 * name an fd contained in fds, which is then taken out of the set and replaces the previously
 * stored fd. */
6770 int exec_shared_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
6771 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt_create
= NULL
;
6772 ExecSharedRuntime
*rt
;
6775 /* This is for the migration from old (v237 or earlier) deserialization text.
6776 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6777 * Even if the ExecSharedRuntime object was originally created by another unit, we cannot tell
6778 * that from the serialized text, so we always create a new object owned by this unit. */
6784 /* Manager manages ExecSharedRuntime objects by the unit id.
6785 * So, we omit the serialized text when the unit does not have id (yet?)... */
6786 if (isempty(u
->id
)) {
/* NOTE(review): the message below speaks of an "Invocation ID" although the check is on the unit
 * id — confirm whether the wording is intended. */
6787 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
6791 if (hashmap_ensure_allocated(&u
->manager
->exec_shared_runtime_by_id
, &string_hash_ops
) < 0)
6794 rt
= hashmap_get(u
->manager
->exec_shared_runtime_by_id
, u
->id
);
6796 if (exec_shared_runtime_allocate(&rt_create
, u
->id
) < 0)
6802 if (streq(key
, "tmp-dir")) {
6803 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
6806 } else if (streq(key
, "var-tmp-dir")) {
6807 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
6810 } else if (streq(key
, "netns-socket-0")) {
6813 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
6814 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
6818 safe_close(rt
->netns_storage_socket
[0]);
6819 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
6821 } else if (streq(key
, "netns-socket-1")) {
6824 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
6825 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
6829 safe_close(rt
->netns_storage_socket
[1]);
6830 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
6835 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6837 r
= hashmap_put(u
->manager
->exec_shared_runtime_by_id
, rt_create
->id
, rt_create
);
6839 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
6843 rt_create
->manager
= u
->manager
;
/* The hashmap owns the object now, so disarm the automatic cleanup. */
6846 TAKE_PTR(rt_create
);
/* Parses one serialized shared-runtime record and registers the result with
 * exec_shared_runtime_add(). value starts with the id, which ends at the first space; it may be
 * followed by the fields "tmp-dir=", "var-tmp-dir=", "netns-socket-0=", "netns-socket-1=",
 * "ipcns-socket-0=" and "ipcns-socket-1=", matching the keys written by
 * exec_shared_runtime_serialize(). Every field value also ends at the next space. An fd field
 * must parse as an fd number and must be contained in fds; the fd is then removed from the set
 * and kept for the new object. Parse errors and unknown fds are logged at debug level and
 * returned as errors. */
6852 int exec_shared_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
6853 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6855 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
6856 const char *p
, *v
= ASSERT_PTR(value
);
6862 n
= strcspn(v
, " ");
6863 id
= strndupa_safe(v
, n
);
6868 v
= startswith(p
, "tmp-dir=");
6870 n
= strcspn(v
, " ");
6871 tmp_dir
= strndup(v
, n
);
6879 v
= startswith(p
, "var-tmp-dir=");
6881 n
= strcspn(v
, " ");
6882 var_tmp_dir
= strndup(v
, n
);
6890 v
= startswith(p
, "netns-socket-0=");
6894 n
= strcspn(v
, " ");
6895 buf
= strndupa_safe(v
, n
);
6897 netns_fdpair
[0] = parse_fd(buf
);
6898 if (netns_fdpair
[0] < 0)
6899 return log_debug_errno(netns_fdpair
[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
6900 if (!fdset_contains(fds
, netns_fdpair
[0]))
6901 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6902 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
6903 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
6909 v
= startswith(p
, "netns-socket-1=");
6913 n
= strcspn(v
, " ");
6914 buf
= strndupa_safe(v
, n
);
6916 netns_fdpair
[1] = parse_fd(buf
);
6917 if (netns_fdpair
[1] < 0)
6918 return log_debug_errno(netns_fdpair
[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
6919 if (!fdset_contains(fds
, netns_fdpair
[1]))
6920 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6921 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
6922 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
6928 v
= startswith(p
, "ipcns-socket-0=");
6932 n
= strcspn(v
, " ");
6933 buf
= strndupa_safe(v
, n
);
6935 ipcns_fdpair
[0] = parse_fd(buf
);
6936 if (ipcns_fdpair
[0] < 0)
6937 return log_debug_errno(ipcns_fdpair
[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
6938 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
6939 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6940 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
6941 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
6947 v
= startswith(p
, "ipcns-socket-1=");
6951 n
= strcspn(v
, " ");
6952 buf
= strndupa_safe(v
, n
);
6954 ipcns_fdpair
[1] = parse_fd(buf
);
6955 if (ipcns_fdpair
[1] < 0)
6956 return log_debug_errno(ipcns_fdpair
[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
6957 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
6958 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6959 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
6960 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
/* Hand everything that was collected to the new object; no result pointer is requested. */
6964 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
6966 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecSharedRuntime objects registered with manager m and releases objects with
 * exec_shared_runtime_free().
 * NOTE(review): the condition that selects which objects are released is not visible here;
 * per the comment below it should be "no references left" — confirm. */
6970 void exec_shared_runtime_vacuum(Manager
*m
) {
6971 ExecSharedRuntime
*rt
;
6975 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6977 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
6981 (void) exec_shared_runtime_free(rt
);
/* Builds an ExecRuntime and returns it through *ret. The object carries the dynamic credentials
 * passed in and, when the context needs an ephemeral copy, a randomly named path below
 * /var/lib/systemd/ephemeral-trees (the parent directory is created with mode 0755) together with
 * an AF_UNIX datagram socket pair. Ownership of path and sockets moves into the new object via
 * TAKE_PTR()/TAKE_FD().
 * NOTE(review): when there is neither a shared runtime, nor credentials, nor a need for an
 * ephemeral copy, presumably nothing is allocated — confirm. */
6985 int exec_runtime_make(
6987 const ExecContext
*context
,
6988 ExecSharedRuntime
*shared
,
6989 DynamicCreds
*creds
,
6990 ExecRuntime
**ret
) {
6991 _cleanup_close_pair_
int ephemeral_storage_socket
[2] = PIPE_EBADF
;
6992 _cleanup_free_
char *ephemeral
= NULL
;
6993 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
7000 if (!shared
&& !creds
&& !exec_needs_ephemeral(context
)) {
7005 if (exec_needs_ephemeral(context
)) {
7006 r
= mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7010 r
= tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit
->id
, &ephemeral
);
7014 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ephemeral_storage_socket
) < 0)
7018 rt
= new(ExecRuntime
, 1);
7022 *rt
= (ExecRuntime
) {
7024 .dynamic_creds
= creds
,
7025 .ephemeral_copy
= TAKE_PTR(ephemeral
),
7026 .ephemeral_storage_socket
[0] = TAKE_FD(ephemeral_storage_socket
[0]),
7027 .ephemeral_storage_socket
[1] = TAKE_FD(ephemeral_storage_socket
[1]),
7030 *ret
= TAKE_PTR(rt
);
/* Releases an ExecRuntime: drops its references to the shared runtime and the dynamic
 * credentials, removes the ephemeral copy through destroy_tree() and closes the ephemeral storage
 * socket pair. */
7034 ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
) {
7038 exec_shared_runtime_unref(rt
->shared
);
7039 dynamic_creds_unref(rt
->dynamic_creds
);
7041 rt
->ephemeral_copy
= destroy_tree(rt
->ephemeral_copy
);
7043 safe_close_pair(rt
->ephemeral_storage_socket
);
/* Like exec_runtime_free(), but the shared runtime and the dynamic credentials are destroyed
 * (rather than merely unreferenced) first; the result of exec_runtime_free() is returned. */
7047 ExecRuntime
* exec_runtime_destroy(ExecRuntime
*rt
) {
7051 rt
->shared
= exec_shared_runtime_destroy(rt
->shared
);
7052 rt
->dynamic_creds
= dynamic_creds_destroy(rt
->dynamic_creds
);
7053 return exec_runtime_free(rt
);
/* Releases the resources held by the fields of p shown here: the environment and fd name string
 * vectors, the fds array and the exec_fd file descriptor. Every field is overwritten with the
 * value returned by its release helper. */
7056 void exec_params_clear(ExecParameters
*p
) {
7060 p
->environment
= strv_free(p
->environment
);
7061 p
->fd_names
= strv_free(p
->fd_names
);
7062 p
->fds
= mfree(p
->fds
);
7063 p
->exec_fd
= safe_close(p
->exec_fd
);
/* Releases the contents of d: the path and the symlink vector of each of its n_items entries,
 * then the items array itself. */
7066 void exec_directory_done(ExecDirectory
*d
) {
7070 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7071 free(d
->items
[i
].path
);
7072 strv_free(d
->items
[i
].symlinks
);
7075 d
->items
= mfree(d
->items
);
/* Linear search through the items of d for an entry whose path matches path according to
 * path_equal(); a pointer to the first match is returned. */
7080 static ExecDirectoryItem
*exec_directory_find(ExecDirectory
*d
, const char *path
) {
7084 for (size_t i
= 0; i
< d
->n_items
; i
++)
7085 if (path_equal(d
->items
[i
].path
, path
))
7086 return &d
->items
[i
];
/* Adds the directory path, with an optional symlink, to d. If an item for path is already
 * present, only its symlink list is extended and 0 is returned. Otherwise the items array is
 * grown by one element, the new item takes ownership of the path string p and the symlink vector
 * s, and 1 is returned. */
7091 int exec_directory_add(ExecDirectory
*d
, const char *path
, const char *symlink
) {
7092 _cleanup_strv_free_
char **s
= NULL
;
7093 _cleanup_free_
char *p
= NULL
;
7094 ExecDirectoryItem
*existing
;
7100 existing
= exec_directory_find(d
, path
);
7102 r
= strv_extend(&existing
->symlinks
, symlink
);
7106 return 0; /* existing item is updated */
7114 s
= strv_new(symlink
);
7119 if (!GREEDY_REALLOC(d
->items
, d
->n_items
+ 1))
7122 d
->items
[d
->n_items
++] = (ExecDirectoryItem
) {
7123 .path
= TAKE_PTR(p
),
7124 .symlinks
= TAKE_PTR(s
),
7127 return 1; /* new item is added */
7130 static int exec_directory_item_compare_func(const ExecDirectoryItem
*a
, const ExecDirectoryItem
*b
) {
7134 return path_compare(a
->path
, b
->path
);
7137 void exec_directory_sort(ExecDirectory
*d
) {
7140 /* Sort the exec directories to make always parent directories processed at first in
7141 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7142 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7143 * list. See also comments in setup_exec_directory() and issue #24783. */
7145 if (d
->n_items
<= 1)
7148 typesafe_qsort(d
->items
, d
->n_items
, exec_directory_item_compare_func
);
7150 for (size_t i
= 1; i
< d
->n_items
; i
++)
7151 for (size_t j
= 0; j
< i
; j
++)
7152 if (path_startswith(d
->items
[i
].path
, d
->items
[j
].path
)) {
7153 d
->items
[i
].only_create
= true;
7158 ExecCleanMask
exec_clean_mask_from_string(const char *s
) {
7159 ExecDirectoryType t
;
7163 if (streq(s
, "all"))
7164 return EXEC_CLEAN_ALL
;
7165 if (streq(s
, "fdstore"))
7166 return EXEC_CLEAN_FDSTORE
;
7168 t
= exec_resource_type_from_string(s
);
7170 return (ExecCleanMask
) t
;
7175 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
7176 [EXEC_INPUT_NULL
] = "null",
7177 [EXEC_INPUT_TTY
] = "tty",
7178 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
7179 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
7180 [EXEC_INPUT_SOCKET
] = "socket",
7181 [EXEC_INPUT_NAMED_FD
] = "fd",
7182 [EXEC_INPUT_DATA
] = "data",
7183 [EXEC_INPUT_FILE
] = "file",
7186 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
7188 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
7189 [EXEC_OUTPUT_INHERIT
] = "inherit",
7190 [EXEC_OUTPUT_NULL
] = "null",
7191 [EXEC_OUTPUT_TTY
] = "tty",
7192 [EXEC_OUTPUT_KMSG
] = "kmsg",
7193 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
7194 [EXEC_OUTPUT_JOURNAL
] = "journal",
7195 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
7196 [EXEC_OUTPUT_SOCKET
] = "socket",
7197 [EXEC_OUTPUT_NAMED_FD
] = "fd",
7198 [EXEC_OUTPUT_FILE
] = "file",
7199 [EXEC_OUTPUT_FILE_APPEND
] = "append",
7200 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
7203 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
7205 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
7206 [EXEC_UTMP_INIT
] = "init",
7207 [EXEC_UTMP_LOGIN
] = "login",
7208 [EXEC_UTMP_USER
] = "user",
7211 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
7213 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
7214 [EXEC_PRESERVE_NO
] = "no",
7215 [EXEC_PRESERVE_YES
] = "yes",
7216 [EXEC_PRESERVE_RESTART
] = "restart",
7219 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
7221 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7222 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7223 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
7224 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
7225 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
7226 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
7227 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
7230 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
7232 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7233 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7234 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
7235 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
7236 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
7237 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
7238 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
7241 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
7243 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7244 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7245 * directories, specifically .timer units with their timestamp touch file. */
7246 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7247 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
7248 [EXEC_DIRECTORY_STATE
] = "state",
7249 [EXEC_DIRECTORY_CACHE
] = "cache",
7250 [EXEC_DIRECTORY_LOGS
] = "logs",
7251 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
7254 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
7256 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7257 * the service payload in. */
7258 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7259 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
7260 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
7261 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
7262 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
7263 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
7266 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
7268 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
7269 [EXEC_KEYRING_INHERIT
] = "inherit",
7270 [EXEC_KEYRING_PRIVATE
] = "private",
7271 [EXEC_KEYRING_SHARED
] = "shared",
7274 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);