1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <sys/personality.h>
14 #if HAVE_VALGRIND_VALGRIND_H
15 #include <valgrind/valgrind.h>
18 #include "sd-messages.h"
20 #include "alloc-util.h"
21 #include "architecture.h"
22 #include "argv-util.h"
23 #include "cgroup-util.h"
24 #include "dirent-util.h"
26 #include "errno-util.h"
31 #include "hostname-util.h"
33 #include "iovec-util.h"
34 #include "locale-util.h"
36 #include "memory-util.h"
37 #include "missing_syscall.h"
38 #include "mountpoint-util.h"
39 #include "namespace-util.h"
40 #include "nulstr-util.h"
41 #include "parse-util.h"
42 #include "path-util.h"
43 #include "pidfd-util.h"
45 #include "process-util.h"
46 #include "raw-clone.h"
47 #include "rlimit-util.h"
48 #include "signal-util.h"
49 #include "socket-util.h"
50 #include "stat-util.h"
51 #include "stdio-util.h"
52 #include "string-table.h"
53 #include "string-util.h"
54 #include "time-util.h"
55 #include "user-util.h"
57 /* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
58 * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
60 #define COMM_MAX_LEN 128
62 static int get_process_state(pid_t pid
) {
63 _cleanup_free_
char *line
= NULL
;
70 /* Shortcut: if we are enquired about our own state, we are obviously running */
71 if (pid
== 0 || pid
== getpid_cached())
72 return (unsigned char) 'R';
74 p
= procfs_file_alloca(pid
, "stat");
76 r
= read_one_line_file(p
, &line
);
82 p
= strrchr(line
, ')');
88 if (sscanf(p
, " %c", &state
) != 1)
91 return (unsigned char) state
;
94 int pid_get_comm(pid_t pid
, char **ret
) {
95 _cleanup_free_
char *escaped
= NULL
, *comm
= NULL
;
101 if (pid
== 0 || pid
== getpid_cached()) {
102 comm
= new0(char, TASK_COMM_LEN
+ 1); /* Must fit in 16 byte according to prctl(2) */
106 if (prctl(PR_GET_NAME
, comm
) < 0)
111 p
= procfs_file_alloca(pid
, "comm");
113 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
114 r
= read_one_line_file(p
, &comm
);
121 escaped
= new(char, COMM_MAX_LEN
);
125 /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
126 cellescape(escaped
, COMM_MAX_LEN
, comm
);
128 *ret
= TAKE_PTR(escaped
);
132 int pidref_get_comm(const PidRef
*pid
, char **ret
) {
133 _cleanup_free_
char *comm
= NULL
;
136 if (!pidref_is_set(pid
))
139 if (pidref_is_remote(pid
))
142 r
= pid_get_comm(pid
->pid
, &comm
);
146 r
= pidref_verify(pid
);
151 *ret
= TAKE_PTR(comm
);
155 static int pid_get_cmdline_nulstr(
158 ProcessCmdlineFlags flags
,
162 _cleanup_free_
char *t
= NULL
;
167 /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
168 * with a specified size.
170 * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
171 * (the case for kernel threads), or has a command line that resolves to the empty string, will
172 * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
175 * Returns an error, 0 if output was read but is truncated, 1 otherwise.
178 p
= procfs_file_alloca(pid
, "cmdline");
179 r
= read_virtual_file(p
, max_size
, &t
, &k
); /* Let's assume that each input byte results in >= 1
180 * columns of output. We ignore zero-width codepoints. */
187 if (!(flags
& PROCESS_CMDLINE_COMM_FALLBACK
))
190 /* Kernel threads have no argv[] */
191 _cleanup_free_
char *comm
= NULL
;
193 r
= pid_get_comm(pid
, &comm
);
198 t
= strjoin("[", comm
, "]");
204 if (r
== 0) /* truncation */
216 int pid_get_cmdline(pid_t pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
217 _cleanup_free_
char *t
= NULL
;
224 /* Retrieve and format a command line. See above for discussion of retrieval options.
226 * There are two main formatting modes:
228 * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
229 * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
232 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
233 * returned string is of the specified console width at most, abbreviated with an ellipsis.
235 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
236 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *ret otherwise. */
238 int full
= pid_get_cmdline_nulstr(pid
, max_columns
, flags
, &t
, &k
);
242 if (flags
& (PROCESS_CMDLINE_QUOTE
| PROCESS_CMDLINE_QUOTE_POSIX
)) {
243 ShellEscapeFlags shflags
= SHELL_ESCAPE_EMPTY
|
244 FLAGS_SET(flags
, PROCESS_CMDLINE_QUOTE_POSIX
) * SHELL_ESCAPE_POSIX
;
246 assert(!(flags
& PROCESS_CMDLINE_USE_LOCALE
));
248 _cleanup_strv_free_
char **args
= NULL
;
250 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
251 * See also issue #21186. */
252 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
256 ans
= quote_command_line(args
, shflags
);
260 /* Arguments are separated by NULs. Let's replace those with spaces. */
261 for (size_t i
= 0; i
< k
- 1; i
++)
265 delete_trailing_chars(t
, WHITESPACE
);
267 bool eight_bit
= (flags
& PROCESS_CMDLINE_USE_LOCALE
) && !is_locale_utf8();
269 ans
= escape_non_printable_full(t
, max_columns
,
270 eight_bit
* XESCAPE_8_BIT
| !full
* XESCAPE_FORCE_ELLIPSIS
);
274 ans
= str_realloc(ans
);
281 int pidref_get_cmdline(const PidRef
*pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
282 _cleanup_free_
char *s
= NULL
;
285 if (!pidref_is_set(pid
))
288 if (pidref_is_remote(pid
))
291 r
= pid_get_cmdline(pid
->pid
, max_columns
, flags
, &s
);
295 r
= pidref_verify(pid
);
304 int pid_get_cmdline_strv(pid_t pid
, ProcessCmdlineFlags flags
, char ***ret
) {
305 _cleanup_free_
char *t
= NULL
;
311 assert((flags
& ~PROCESS_CMDLINE_COMM_FALLBACK
) == 0);
314 r
= pid_get_cmdline_nulstr(pid
, SIZE_MAX
, flags
, &t
, &k
);
318 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
326 int pidref_get_cmdline_strv(const PidRef
*pid
, ProcessCmdlineFlags flags
, char ***ret
) {
327 _cleanup_strv_free_
char **args
= NULL
;
330 if (!pidref_is_set(pid
))
333 if (pidref_is_remote(pid
))
336 r
= pid_get_cmdline_strv(pid
->pid
, flags
, &args
);
340 r
= pidref_verify(pid
);
345 *ret
= TAKE_PTR(args
);
350 int container_get_leader(const char *machine
, pid_t
*pid
) {
351 _cleanup_free_
char *s
= NULL
, *class = NULL
;
359 if (streq(machine
, ".host")) {
364 if (!hostname_is_valid(machine
, 0))
367 p
= strjoina("/run/systemd/machines/", machine
);
368 r
= parse_env_file(NULL
, p
,
378 if (!streq_ptr(class, "container"))
381 r
= parse_pid(s
, &leader
);
391 int pid_is_kernel_thread(pid_t pid
) {
392 _cleanup_free_
char *line
= NULL
;
393 unsigned long long flags
;
399 if (IN_SET(pid
, 0, 1) || pid
== getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
401 if (!pid_is_valid(pid
))
404 p
= procfs_file_alloca(pid
, "stat");
405 r
= read_one_line_file(p
, &line
);
411 /* Skip past the comm field */
412 q
= strrchr(line
, ')');
417 /* Skip 6 fields to reach the flags field */
418 for (i
= 0; i
< 6; i
++) {
419 l
= strspn(q
, WHITESPACE
);
424 l
= strcspn(q
, WHITESPACE
);
430 /* Skip preceding whitespace */
431 l
= strspn(q
, WHITESPACE
);
436 /* Truncate the rest */
437 l
= strcspn(q
, WHITESPACE
);
442 r
= safe_atollu(q
, &flags
);
446 return !!(flags
& PF_KTHREAD
);
449 int pidref_is_kernel_thread(const PidRef
*pid
) {
452 if (!pidref_is_set(pid
))
455 if (pidref_is_remote(pid
))
458 result
= pid_is_kernel_thread(pid
->pid
);
462 r
= pidref_verify(pid
); /* Verify that the PID wasn't reused since */
469 static int get_process_link_contents(pid_t pid
, const char *proc_file
, char **ret
) {
475 p
= procfs_file_alloca(pid
, proc_file
);
477 r
= readlink_malloc(p
, ret
);
478 return (r
== -ENOENT
&& proc_mounted() > 0) ? -ESRCH
: r
;
481 int get_process_exe(pid_t pid
, char **ret
) {
487 r
= get_process_link_contents(pid
, "exe", ret
);
492 d
= endswith(*ret
, " (deleted)");
500 int pid_get_uid(pid_t pid
, uid_t
*ret
) {
506 if (pid
== 0 || pid
== getpid_cached()) {
511 _cleanup_free_
char *v
= NULL
;
512 r
= procfs_file_get_field(pid
, "status", "Uid", &v
);
518 return parse_uid(v
, ret
);
521 int pidref_get_uid(const PidRef
*pid
, uid_t
*ret
) {
524 if (!pidref_is_set(pid
))
527 if (pidref_is_remote(pid
))
531 r
= pidfd_get_uid(pid
->fd
, ret
);
532 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r
))
537 r
= pid_get_uid(pid
->pid
, &uid
);
541 r
= pidref_verify(pid
);
550 int get_process_gid(pid_t pid
, gid_t
*ret
) {
556 if (pid
== 0 || pid
== getpid_cached()) {
561 _cleanup_free_
char *v
= NULL
;
562 r
= procfs_file_get_field(pid
, "status", "Gid", &v
);
568 return parse_gid(v
, ret
);
571 int get_process_cwd(pid_t pid
, char **ret
) {
574 if (pid
== 0 || pid
== getpid_cached())
575 return safe_getcwd(ret
);
577 return get_process_link_contents(pid
, "cwd", ret
);
580 int get_process_root(pid_t pid
, char **ret
) {
582 return get_process_link_contents(pid
, "root", ret
);
585 #define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
587 int get_process_environ(pid_t pid
, char **ret
) {
588 _cleanup_fclose_
FILE *f
= NULL
;
589 _cleanup_free_
char *outcome
= NULL
;
597 p
= procfs_file_alloca(pid
, "environ");
599 r
= fopen_unlocked(p
, "re", &f
);
608 if (sz
>= ENVIRONMENT_BLOCK_MAX
)
611 if (!GREEDY_REALLOC(outcome
, sz
+ 5))
614 r
= safe_fgetc(f
, &c
);
621 outcome
[sz
++] = '\n';
623 sz
+= cescape_char(c
, outcome
+ sz
);
627 *ret
= TAKE_PTR(outcome
);
632 int pid_get_ppid(pid_t pid
, pid_t
*ret
) {
633 _cleanup_free_
char *line
= NULL
;
641 pid
= getpid_cached();
642 if (pid
== 1) /* PID 1 has no parent, shortcut this case */
643 return -EADDRNOTAVAIL
;
645 if (pid
== getpid_cached()) {
651 p
= procfs_file_alloca(pid
, "stat");
652 r
= read_one_line_file(p
, &line
);
658 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
659 * value, so let's skip over it manually */
661 p
= strrchr(line
, ')');
672 /* If ppid is zero the process has no parent. Which might be the case for PID 1 (caught above)
673 * but also for processes originating in other namespaces that are inserted into a pidns.
674 * Return a recognizable error in this case. */
676 return -EADDRNOTAVAIL
;
678 if ((pid_t
) ppid
< 0 || (unsigned long) (pid_t
) ppid
!= ppid
)
687 int pidref_get_ppid(const PidRef
*pidref
, pid_t
*ret
) {
690 if (!pidref_is_set(pidref
))
693 if (pidref_is_remote(pidref
))
696 if (pidref
->fd
>= 0) {
697 r
= pidfd_get_ppid(pidref
->fd
, ret
);
698 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r
))
703 r
= pid_get_ppid(pidref
->pid
, ret
? &ppid
: NULL
);
707 r
= pidref_verify(pidref
);
716 int pidref_get_ppid_as_pidref(const PidRef
*pidref
, PidRef
*ret
) {
722 r
= pidref_get_ppid(pidref
, &ppid
);
726 for (unsigned attempt
= 0; attempt
< 16; attempt
++) {
727 _cleanup_(pidref_done
) PidRef parent
= PIDREF_NULL
;
729 r
= pidref_set_pid(&parent
, ppid
);
733 /* If we have a pidfd of the original PID, let's verify that the process we acquired really
734 * is the parent still */
735 if (pidref
->fd
>= 0) {
736 r
= pidref_get_ppid(pidref
, &ppid
);
740 /* Did the PPID change since we queried it? If so we might have pinned the wrong
741 * process, if its PID got reused by now. Let's try again */
742 if (parent
.pid
!= ppid
)
746 *ret
= TAKE_PIDREF(parent
);
750 /* Give up after 16 tries */
751 return -ENOTRECOVERABLE
;
754 int pid_get_start_time(pid_t pid
, usec_t
*ret
) {
755 _cleanup_free_
char *line
= NULL
;
761 p
= procfs_file_alloca(pid
, "stat");
762 r
= read_one_line_file(p
, &line
);
768 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
769 * value, so let's skip over it manually */
771 p
= strrchr(line
, ')');
794 "%*i " /* priority */
796 "%*u " /* num_threads */
797 "%*u " /* itrealvalue */
798 "%lu ", /* starttime */
803 *ret
= jiffies_to_usec(llu
); /* CLOCK_BOOTTIME */
808 int pidref_get_start_time(const PidRef
*pid
, usec_t
*ret
) {
812 if (!pidref_is_set(pid
))
815 if (pidref_is_remote(pid
))
818 r
= pid_get_start_time(pid
->pid
, ret
? &t
: NULL
);
822 r
= pidref_verify(pid
);
832 int get_process_umask(pid_t pid
, mode_t
*ret
) {
833 _cleanup_free_
char *m
= NULL
;
839 r
= procfs_file_get_field(pid
, "status", "Umask", &m
);
845 return parse_mode(m
, ret
);
/* Convenience wrapper taking a plain pid_t: wraps it in a temporary PidRef via PIDREF_MAKE_FROM_PID() and
 * forwards to pidref_wait_for_terminate(), passing "ret" through and returning that call's result. */
848 int wait_for_terminate(pid_t pid
, siginfo_t
*ret
) {
849 return pidref_wait_for_terminate(&PIDREF_MAKE_FROM_PID(pid
), ret
);
854 * < 0 : wait_for_terminate() failed to get the state of the
855 * process, the process was terminated by a signal, or
856 * failed for an unknown reason.
857 * >=0 : The process terminated normally, and its exit code is
860 * That is, success is indicated by a return value of zero, and an
861 * error is indicated by a non-zero value.
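863 * A message is logged if the process terminates abnormally (at LOG_ERR if WAIT_LOG_ABNORMAL is set in flags, otherwise at LOG_DEBUG),
864 * and also if it exits with a non-zero status (at LOG_ERR if WAIT_LOG_NON_ZERO_EXIT_STATUS is set, otherwise at LOG_DEBUG).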
866 int pidref_wait_for_terminate_and_check(const char *name
, PidRef
*pidref
, WaitFlags flags
) {
869 if (!pidref_is_set(pidref
))
871 if (pidref_is_remote(pidref
))
873 if (pidref
->pid
== 1 || pidref_is_self(pidref
))
876 _cleanup_free_
char *buffer
= NULL
;
878 r
= pidref_get_comm(pidref
, &buffer
);
880 log_debug_errno(r
, "Failed to acquire process name of " PID_FMT
", ignoring: %m", pidref
->pid
);
885 int prio
= flags
& WAIT_LOG_ABNORMAL
? LOG_ERR
: LOG_DEBUG
;
888 r
= pidref_wait_for_terminate(pidref
, &status
);
890 return log_full_errno(prio
, r
, "Failed to wait for %s: %m", strna(name
));
892 if (status
.si_code
== CLD_EXITED
) {
893 if (status
.si_status
!= EXIT_SUCCESS
)
894 log_full(flags
& WAIT_LOG_NON_ZERO_EXIT_STATUS
? LOG_ERR
: LOG_DEBUG
,
895 "%s failed with exit status %i.", strna(name
), status
.si_status
);
897 log_debug("%s succeeded.", name
);
899 return status
.si_status
;
901 } else if (IN_SET(status
.si_code
, CLD_KILLED
, CLD_DUMPED
)) {
903 log_full(prio
, "%s terminated by signal %s.", strna(name
), signal_to_string(status
.si_status
));
907 log_full(prio
, "%s failed due to unknown reason.", strna(name
));
/* Convenience wrapper taking a plain pid_t: wraps it in a temporary PidRef via PIDREF_MAKE_FROM_PID() and
 * forwards to pidref_wait_for_terminate_and_check() with the same name and flags, returning its result. */
911 int wait_for_terminate_and_check(const char *name
, pid_t pid
, WaitFlags flags
) {
912 return pidref_wait_for_terminate_and_check(name
, &PIDREF_MAKE_FROM_PID(pid
), flags
);
918 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
919 * was terminated by a signal, or failed for an unknown reason.
921 * >=0 : The process terminated normally with no failures.
923 * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
924 * states are indicated by a non-zero value.
926 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
927 * to remain entirely race-free.
929 int wait_for_terminate_with_timeout(pid_t pid
, usec_t timeout
) {
934 assert_se(sigemptyset(&mask
) == 0);
935 assert_se(sigaddset(&mask
, SIGCHLD
) == 0);
937 /* Drop into a sigtimedwait-based timeout. Waiting for the
939 until
= usec_add(now(CLOCK_MONOTONIC
), timeout
);
942 siginfo_t status
= {};
944 n
= now(CLOCK_MONOTONIC
);
948 r
= RET_NERRNO(sigtimedwait(&mask
, NULL
, TIMESPEC_STORE(until
- n
)));
949 /* Assuming we woke due to the child exiting. */
950 if (waitid(P_PID
, pid
, &status
, WEXITED
|WNOHANG
) == 0) {
951 if (status
.si_pid
== pid
) {
952 /* This is the correct child. */
953 if (status
.si_code
== CLD_EXITED
)
954 return status
.si_status
== 0 ? 0 : -EPROTO
;
959 /* Not the child, check for errors and proceed appropriately */
963 /* Timed out, child is likely hung. */
966 /* Received a different signal and should retry */
969 /* Return any unexpected errors */
978 void sigkill_wait(pid_t pid
) {
981 (void) kill(pid
, SIGKILL
);
982 (void) wait_for_terminate(pid
, NULL
);
985 void sigkill_waitp(pid_t
*pid
) {
996 void sigterm_wait(pid_t pid
) {
999 (void) kill_and_sigcont(pid
, SIGTERM
);
1000 (void) wait_for_terminate(pid
, NULL
);
1003 void sigkill_nowait(pid_t pid
) {
1006 (void) kill(pid
, SIGKILL
);
1009 void sigkill_nowaitp(pid_t
*pid
) {
1017 sigkill_nowait(*pid
);
1020 int kill_and_sigcont(pid_t pid
, int sig
) {
1023 r
= RET_NERRNO(kill(pid
, sig
));
1025 /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
1026 * affected by a process being suspended anyway. */
1027 if (r
>= 0 && !IN_SET(sig
, SIGCONT
, SIGKILL
))
1028 (void) kill(pid
, SIGCONT
);
1033 int getenv_for_pid(pid_t pid
, const char *field
, char **ret
) {
1034 _cleanup_fclose_
FILE *f
= NULL
;
1043 if (pid
== 0 || pid
== getpid_cached())
1044 return strdup_to_full(ret
, getenv(field
));
1046 if (!pid_is_valid(pid
))
1049 path
= procfs_file_alloca(pid
, "environ");
1051 r
= fopen_unlocked(path
, "re", &f
);
1058 _cleanup_free_
char *line
= NULL
;
1061 if (sum
> ENVIRONMENT_BLOCK_MAX
) /* Give up searching eventually */
1064 r
= read_nul_string(f
, LONG_LINE_MAX
, &line
);
1067 if (r
== 0) /* EOF */
1072 match
= startswith(line
, field
);
1073 if (match
&& *match
== '=')
1074 return strdup_to_full(ret
, match
+ 1);
1081 int pidref_is_my_child(PidRef
*pid
) {
1084 if (!pidref_is_set(pid
))
1087 if (pidref_is_remote(pid
))
1090 if (pid
->pid
== 1 || pidref_is_self(pid
))
1094 r
= pidref_get_ppid(pid
, &ppid
);
1095 if (r
== -EADDRNOTAVAIL
) /* if this process is outside of our pidns, it is definitely not our child */
1100 return ppid
== getpid_cached();
1103 int pid_is_my_child(pid_t pid
) {
1108 return pidref_is_my_child(&PIDREF_MAKE_FROM_PID(pid
));
1111 int pidref_is_unwaited(PidRef
*pid
) {
1114 /* Checks whether a PID is still valid at all, including a zombie */
1116 if (!pidref_is_set(pid
))
1119 if (pidref_is_remote(pid
))
1122 if (pid
->pid
== 1 || pidref_is_self(pid
))
1125 r
= pidref_kill(pid
, 0);
1134 int pid_is_unwaited(pid_t pid
) {
1139 return pidref_is_unwaited(&PIDREF_MAKE_FROM_PID(pid
));
1142 int pid_is_alive(pid_t pid
) {
1145 /* Checks whether a PID is still valid and not a zombie */
1150 if (pid
<= 1) /* If we or PID 1 would be a zombie, this code would not be running */
1153 if (pid
== getpid_cached())
1156 r
= get_process_state(pid
);
1165 int pidref_is_alive(const PidRef
*pidref
) {
1168 if (!pidref_is_set(pidref
))
1171 if (pidref_is_remote(pidref
))
1174 result
= pid_is_alive(pidref
->pid
);
1176 assert(result
!= -ESRCH
);
1180 r
= pidref_verify(pidref
);
1189 int pidref_from_same_root_fs(PidRef
*a
, PidRef
*b
) {
1190 _cleanup_(pidref_done
) PidRef self
= PIDREF_NULL
;
1193 /* Checks if the two specified processes have the same root fs. Either can be specified as NULL in
1194 * which case we'll check against ourselves. */
1197 r
= pidref_set_self(&self
);
1206 if (!pidref_is_set(a
) || !pidref_is_set(b
))
1209 /* If only one of the two processes is remote they cannot have the same root fs, but if both of
1210 * them are we don't know */
1211 if (pidref_is_remote(a
) && pidref_is_remote(b
))
1213 if (pidref_is_remote(a
) || pidref_is_remote(b
))
1216 if (pidref_equal(a
, b
))
1219 const char *roota
= procfs_file_alloca(a
->pid
, "root");
1220 const char *rootb
= procfs_file_alloca(b
->pid
, "root");
1222 int result
= inode_same(roota
, rootb
, 0);
1223 if (result
== -ENOENT
)
1224 return proc_mounted() == 0 ? -ENOSYS
: -ESRCH
;
1228 r
= pidref_verify(a
);
1231 r
= pidref_verify(b
);
1238 bool is_main_thread(void) {
1239 static thread_local
int cached
= -1;
1242 cached
= getpid_cached() == gettid();
/* Returns true if "oa" lies within the range OOM_SCORE_ADJ_MIN…OOM_SCORE_ADJ_MAX, both bounds inclusive. */
1247 bool oom_score_adjust_is_valid(int oa
) {
1248 return oa
>= OOM_SCORE_ADJ_MIN
&& oa
<= OOM_SCORE_ADJ_MAX
;
1251 unsigned long personality_from_string(const char *p
) {
1252 Architecture architecture
;
1255 return PERSONALITY_INVALID
;
1257 /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1258 * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1259 * the same register size. */
1261 architecture
= architecture_from_string(p
);
1262 if (architecture
< 0)
1263 return PERSONALITY_INVALID
;
1265 if (architecture
== native_architecture())
1267 #ifdef ARCHITECTURE_SECONDARY
1268 if (architecture
== ARCHITECTURE_SECONDARY
)
1272 return PERSONALITY_INVALID
;
1275 const char* personality_to_string(unsigned long p
) {
1276 Architecture architecture
= _ARCHITECTURE_INVALID
;
1279 architecture
= native_architecture();
1280 #ifdef ARCHITECTURE_SECONDARY
1281 else if (p
== PER_LINUX32
)
1282 architecture
= ARCHITECTURE_SECONDARY
;
1285 if (architecture
< 0)
1288 return architecture_to_string(architecture
);
1291 int safe_personality(unsigned long p
) {
1294 /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno,
1295 * and in others as negative return value containing an errno-like value. Let's work around this: this is a
1296 * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and
1297 * the return value indicating the same issue, so that we are definitely on the safe side.
1299 * See https://github.com/systemd/systemd/issues/6737 */
1302 ret
= personality(p
);
1313 int opinionated_personality(unsigned long *ret
) {
1316 /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1317 * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1318 * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1320 current
= safe_personality(PERSONALITY_INVALID
);
1324 if (((unsigned long) current
& OPINIONATED_PERSONALITY_MASK
) == PER_LINUX32
)
1332 void valgrind_summary_hack(void) {
1333 #if HAVE_VALGRIND_VALGRIND_H
1334 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND
) {
1336 pid
= raw_clone(SIGCHLD
);
1340 LOG_MESSAGE_ID(SD_MESSAGE_VALGRIND_HELPER_FORK_STR
),
1341 LOG_MESSAGE("Failed to fork off valgrind helper: %m"));
1345 log_info("Spawned valgrind helper as PID "PID_FMT
".", pid
);
1346 (void) wait_for_terminate(pid
, NULL
);
1352 int pid_compare_func(const pid_t
*a
, const pid_t
*b
) {
1353 /* Suitable for usage in qsort() */
/* Returns true if "n" is an acceptable nice level, i.e. PRIO_MIN <= n < PRIO_MAX. Note that the upper
 * bound is exclusive: PRIO_MAX itself is rejected. */
1357 bool nice_is_valid(int n
) {
1358 return n
>= PRIO_MIN
&& n
< PRIO_MAX
;
/* Returns true if "i" is one of the scheduling policies accepted here: SCHED_OTHER, SCHED_BATCH,
 * SCHED_IDLE, SCHED_FIFO or SCHED_RR. Any other value is rejected. */
1361 bool sched_policy_is_valid(int i
) {
1362 return IN_SET(i
, SCHED_OTHER
, SCHED_BATCH
, SCHED_IDLE
, SCHED_FIFO
, SCHED_RR
);
/* Returns true if "i" is within 0…sched_get_priority_max(SCHED_RR), both bounds inclusive. The upper
 * bound is queried at runtime for SCHED_RR, regardless of which policy the priority is meant for. */
1365 bool sched_priority_is_valid(int i
) {
1366 return i
>= 0 && i
<= sched_get_priority_max(SCHED_RR
);
1369 /* The cached PID, possible values:
1371 * == UNSET [0] → cache not initialized yet
1372 * == BUSY [-1] → some thread is initializing it at the moment
1373 * any other → the cached PID
1376 #define CACHED_PID_UNSET ((pid_t) 0)
1377 #define CACHED_PID_BUSY ((pid_t) -1)
1379 static pid_t cached_pid
= CACHED_PID_UNSET
;
/* Forgets the cached PID by setting cached_pid back to CACHED_PID_UNSET. */
1381 void reset_cached_pid(void) {
1382 /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
1383 cached_pid
= CACHED_PID_UNSET
;
1386 pid_t
getpid_cached(void) {
1387 static bool installed
= false;
1388 pid_t current_value
= CACHED_PID_UNSET
;
1390 /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1391 * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1392 * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1393 * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1395 * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1396 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1399 (void) __atomic_compare_exchange_n(
1407 switch (current_value
) {
1409 case CACHED_PID_UNSET
: { /* Not initialized yet, then do so now */
1415 /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1416 * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1417 * we'll check for errors only in the most generic fashion possible. */
1419 if (pthread_atfork(NULL
, NULL
, reset_cached_pid
) != 0) {
1420 /* OOM? Let's try again later */
1421 cached_pid
= CACHED_PID_UNSET
;
1428 cached_pid
= new_pid
;
1432 case CACHED_PID_BUSY
: /* Somebody else is currently initializing */
1435 default: /* Properly initialized */
1436 return current_value
;
1440 int must_be_root(void) {
1445 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Need to be root.");
1448 pid_t
clone_with_nested_stack(int (*fn
)(void *), int flags
, void *userdata
) {
1453 /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1454 * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1456 * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1457 * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1458 * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1459 * strictly single-threaded or your child never calls malloc(). */
1461 assert((flags
& (CLONE_VM
|CLONE_PARENT_SETTID
|CLONE_CHILD_SETTID
|
1462 CLONE_CHILD_CLEARTID
|CLONE_SETTLS
)) == 0);
1464 /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1465 * the net effect is that the child will have the start of its stack inside the stack of the parent,
1466 * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1467 * since we don't want to deal with differences between systems where the stack grows backwards or
1468 * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1469 * want it page aligned, hence we'll allocate one page more. Makes 3. */
1472 mystack
= alloca(ps
*3);
1473 mystack
= (uint8_t*) mystack
+ ps
; /* move pointer one page ahead since stacks usually grow backwards */
1474 mystack
= (void*) ALIGN_TO((uintptr_t) mystack
, ps
); /* align to page size (moving things further ahead) */
1477 pid
= clone(fn
, mystack
, flags
, userdata
);
1479 pid
= __clone2(fn
, mystack
, ps
, flags
, userdata
);
1487 static void restore_sigsetp(sigset_t
**ssp
) {
1489 (void) sigprocmask(SIG_SETMASK
, *ssp
, NULL
);
1492 static int fork_flags_to_signal(ForkFlags flags
) {
1493 return (flags
& FORK_DEATHSIG_SIGTERM
) ? SIGTERM
:
1494 (flags
& FORK_DEATHSIG_SIGINT
) ? SIGINT
:
1498 int pidref_safe_fork_full(
1500 const int stdio_fds
[3],
1502 size_t n_except_fds
,
1506 pid_t original_pid
, pid
;
1507 sigset_t saved_ss
, ss
;
1508 _unused_
_cleanup_(restore_sigsetp
) sigset_t
*saved_ssp
= NULL
;
1509 bool block_signals
= false, block_all
= false, intermediary
= false;
1510 _cleanup_close_pair_
int pidref_transport_fds
[2] = EBADF_PAIR
;
1513 assert(!FLAGS_SET(flags
, FORK_WAIT
|FORK_FREEZE
));
1514 assert(!FLAGS_SET(flags
, FORK_DETACH
) ||
1515 (flags
& (FORK_WAIT
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGKILL
)) == 0);
1517 /* A wrapper around fork(), that does a couple of important initializations in addition to mere
1518 * forking. If provided, ret_pid is initialized in both the parent and the child process, both times
1519 * referencing the child process. Returns == 0 in the child and > 0 in the parent. */
1521 prio
= flags
& FORK_LOG
? LOG_ERR
: LOG_DEBUG
;
1523 original_pid
= getpid_cached();
1525 if (flags
& FORK_FLUSH_STDIO
) {
1527 fflush(stderr
); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
1530 if (flags
& (FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
)) {
1531 /* We temporarily block all signals, so that the new child has them blocked initially. This
1532 * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
1533 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1535 assert_se(sigfillset(&ss
) >= 0);
1536 block_signals
= block_all
= true;
1538 } else if (flags
& FORK_WAIT
) {
1539 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1541 assert_se(sigemptyset(&ss
) >= 0);
1542 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1543 block_signals
= true;
1546 if (block_signals
) {
1547 if (sigprocmask(SIG_BLOCK
, &ss
, &saved_ss
) < 0)
1548 return log_full_errno(prio
, errno
, "Failed to block signal mask: %m");
1549 saved_ssp
= &saved_ss
;
1552 if (FLAGS_SET(flags
, FORK_DETACH
)) {
1553 /* Fork off intermediary child if needed */
1555 r
= is_reaper_process();
1557 return log_full_errno(prio
, r
, "Failed to determine if we are a reaper process: %m");
1560 /* Not a reaper process, hence do a double fork() so we are reparented to one */
1562 if (ret_pid
&& socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pidref_transport_fds
) < 0)
1563 return log_full_errno(prio
, errno
, "Failed to allocate pidref socket: %m");
1567 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1569 log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT
".", strna(name
), pid
);
1571 pidref_transport_fds
[1] = safe_close(pidref_transport_fds
[1]);
1573 if (pidref_transport_fds
[0] >= 0) {
1574 /* Wait for the intermediary child to exit so the caller can be certain the actual child
1575 * process has been reparented by the time this function returns. */
1576 r
= wait_for_terminate_and_check(name
, pid
, FLAGS_SET(flags
, FORK_LOG
) ? WAIT_LOG
: 0);
1578 return log_full_errno(prio
, r
, "Failed to wait for intermediary process: %m");
1579 if (r
!= EXIT_SUCCESS
) /* exit status > 0 should be treated as failure, too */
1583 ssize_t n
= receive_one_fd_iov(
1584 pidref_transport_fds
[0],
1585 &IOVEC_MAKE(&pid
, sizeof(pid
)),
1590 return log_full_errno(prio
, n
, "Failed to receive child pidref: %m");
1592 *ret_pid
= (PidRef
) { .pid
= pid
, .fd
= pidfd
};
1595 return 1; /* return in the parent */
1598 pidref_transport_fds
[0] = safe_close(pidref_transport_fds
[0]);
1599 intermediary
= true;
1603 if ((flags
& (FORK_NEW_MOUNTNS
|FORK_NEW_USERNS
|FORK_NEW_NETNS
|FORK_NEW_PIDNS
)) != 0)
1604 pid
= raw_clone(SIGCHLD
|
1605 (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
) ? CLONE_NEWNS
: 0) |
1606 (FLAGS_SET(flags
, FORK_NEW_USERNS
) ? CLONE_NEWUSER
: 0) |
1607 (FLAGS_SET(flags
, FORK_NEW_NETNS
) ? CLONE_NEWNET
: 0) |
1608 (FLAGS_SET(flags
, FORK_NEW_PIDNS
) ? CLONE_NEWPID
: 0));
1612 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1615 /* If we are in the intermediary process, exit now */
1617 if (pidref_transport_fds
[1] >= 0) {
1618 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
1620 r
= pidref_set_pid(&pidref
, pid
);
1622 log_full_errno(prio
, r
, "Failed to open reference to PID "PID_FMT
": %m", pid
);
1623 _exit(EXIT_FAILURE
);
1626 r
= send_one_fd_iov(
1627 pidref_transport_fds
[1],
1629 &IOVEC_MAKE(&pidref
.pid
, sizeof(pidref
.pid
)),
1633 log_full_errno(prio
, r
, "Failed to send child pidref: %m");
1634 _exit(EXIT_FAILURE
);
1638 _exit(EXIT_SUCCESS
);
1641 /* We are in the parent process */
1642 log_debug("Successfully forked off '%s' as PID " PID_FMT
".", strna(name
), pid
);
1644 if (flags
& FORK_WAIT
) {
1646 /* undo everything except SIGCHLD */
1648 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1649 (void) sigprocmask(SIG_SETMASK
, &ss
, NULL
);
1652 r
= wait_for_terminate_and_check(name
, pid
, (flags
& FORK_LOG
? WAIT_LOG
: 0));
1655 if (r
!= EXIT_SUCCESS
) /* exit status > 0 should be treated as failure, too */
1658 /* If we are in the parent and successfully waited, then the process doesn't exist anymore. */
1660 *ret_pid
= PIDREF_NULL
;
1666 if (FLAGS_SET(flags
, FORK_PID_ONLY
))
1667 *ret_pid
= PIDREF_MAKE_FROM_PID(pid
);
1669 r
= pidref_set_pid(ret_pid
, pid
);
1670 if (r
< 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1671 *ret_pid
= PIDREF_MAKE_FROM_PID(pid
);
1678 /* We are in the child process */
1680 pidref_transport_fds
[1] = safe_close(pidref_transport_fds
[1]);
1682 /* Restore signal mask manually */
1685 if (flags
& FORK_REOPEN_LOG
) {
1686 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1688 log_set_open_when_needed(true);
1689 log_settle_target();
1693 r
= rename_process(name
);
1695 log_full_errno(flags
& FORK_LOG
? LOG_WARNING
: LOG_DEBUG
,
1696 r
, "Failed to rename process, ignoring: %m");
1699 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGKILL
))
1700 if (prctl(PR_SET_PDEATHSIG
, fork_flags_to_signal(flags
)) < 0) {
1701 log_full_errno(prio
, errno
, "Failed to set death signal: %m");
1702 _exit(EXIT_FAILURE
);
1705 if (flags
& FORK_RESET_SIGNALS
) {
1706 r
= reset_all_signal_handlers();
1708 log_full_errno(prio
, r
, "Failed to reset signal handlers: %m");
1709 _exit(EXIT_FAILURE
);
1712 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1713 r
= reset_signal_mask();
1715 log_full_errno(prio
, r
, "Failed to reset signal mask: %m");
1716 _exit(EXIT_FAILURE
);
1718 } else if (block_signals
) { /* undo what we did above */
1719 if (sigprocmask(SIG_SETMASK
, &saved_ss
, NULL
) < 0) {
1720 log_full_errno(prio
, errno
, "Failed to restore signal mask: %m");
1721 _exit(EXIT_FAILURE
);
1725 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
|FORK_DEATHSIG_SIGINT
)) {
1727 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1728 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1732 /* Parent is in a different PID namespace. */;
1733 else if (ppid
!= original_pid
) {
1734 int sig
= fork_flags_to_signal(flags
);
1735 log_debug("Parent died early, raising %s.", signal_to_string(sig
));
1737 _exit(EXIT_FAILURE
);
1741 if (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
| FORK_MOUNTNS_SLAVE
)) {
1742 /* Optionally, make sure we never propagate mounts to the host. */
1743 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
1744 log_full_errno(prio
, errno
, "Failed to remount root directory as MS_SLAVE: %m");
1745 _exit(EXIT_FAILURE
);
1749 if (FLAGS_SET(flags
, FORK_PRIVATE_TMP
)) {
1750 assert(FLAGS_SET(flags
, FORK_NEW_MOUNTNS
));
1752 /* Optionally, overmount new tmpfs instance on /tmp/. */
1753 r
= mount_nofollow("tmpfs", "/tmp", "tmpfs",
1755 "mode=01777" TMPFS_LIMITS_RUN
);
1757 log_full_errno(prio
, r
, "Failed to overmount /tmp/: %m");
1758 _exit(EXIT_FAILURE
);
1762 if (flags
& FORK_REARRANGE_STDIO
) {
1764 r
= rearrange_stdio(stdio_fds
[0], stdio_fds
[1], stdio_fds
[2]);
1766 log_full_errno(prio
, r
, "Failed to rearrange stdio fds: %m");
1767 _exit(EXIT_FAILURE
);
1770 /* Turn off O_NONBLOCK on the fdio fds, in case it was left on */
1771 stdio_disable_nonblock();
1773 r
= make_null_stdio();
1775 log_full_errno(prio
, r
, "Failed to connect stdin/stdout to /dev/null: %m");
1776 _exit(EXIT_FAILURE
);
1779 } else if (flags
& FORK_STDOUT_TO_STDERR
) {
1780 if (dup2(STDERR_FILENO
, STDOUT_FILENO
) < 0) {
1781 log_full_errno(prio
, errno
, "Failed to connect stdout to stderr: %m");
1782 _exit(EXIT_FAILURE
);
1786 if (flags
& FORK_CLOSE_ALL_FDS
) {
1787 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1790 r
= close_all_fds(except_fds
, n_except_fds
);
1792 log_full_errno(prio
, r
, "Failed to close all file descriptors: %m");
1793 _exit(EXIT_FAILURE
);
1797 if (flags
& FORK_PACK_FDS
) {
1798 /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1799 * open, this is including the log. This is required by pack_fds, which will
1800 * get stuck in an infinite loop of any FDs other than except_fds are open. */
1801 assert(FLAGS_SET(flags
, FORK_CLOSE_ALL_FDS
));
1803 r
= pack_fds(except_fds
, n_except_fds
);
1805 log_full_errno(prio
, r
, "Failed to pack file descriptors: %m");
1806 _exit(EXIT_FAILURE
);
1810 if (flags
& FORK_CLOEXEC_OFF
) {
1811 r
= fd_cloexec_many(except_fds
, n_except_fds
, false);
1813 log_full_errno(prio
, r
, "Failed to turn off O_CLOEXEC on file descriptors: %m");
1814 _exit(EXIT_FAILURE
);
1818 /* When we were asked to reopen the logs, do so again now */
1819 if (flags
& FORK_REOPEN_LOG
) {
1821 log_set_open_when_needed(false);
1824 if (flags
& FORK_RLIMIT_NOFILE_SAFE
) {
1825 r
= rlimit_nofile_safe();
1827 log_full_errno(prio
, r
, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1828 _exit(EXIT_FAILURE
);
1832 if (!FLAGS_SET(flags
, FORK_KEEP_NOTIFY_SOCKET
)) {
1833 r
= RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
1835 log_full_errno(prio
, r
, "Failed to unset $NOTIFY_SOCKET: %m");
1836 _exit(EXIT_FAILURE
);
1840 if (FLAGS_SET(flags
, FORK_FREEZE
))
1844 if (FLAGS_SET(flags
, FORK_PID_ONLY
))
1845 *ret_pid
= PIDREF_MAKE_FROM_PID(getpid_cached());
1847 r
= pidref_set_self(ret_pid
);
1849 log_full_errno(prio
, r
, "Failed to acquire PID reference on ourselves: %m");
1850 _exit(EXIT_FAILURE
);
1860 const int stdio_fds
[3],
1862 size_t n_except_fds
,
1866 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
1869 /* Getting the detached child process pid without pidfd is racy, so don't allow it if not returning
1870 * a pidref to the caller. */
1871 assert(!FLAGS_SET(flags
, FORK_DETACH
) || !ret_pid
);
1873 r
= pidref_safe_fork_full(name
, stdio_fds
, except_fds
, n_except_fds
, flags
|FORK_PID_ONLY
, ret_pid
? &pidref
: NULL
);
1874 if (r
< 0 || !ret_pid
)
1877 *ret_pid
= pidref
.pid
;
1883 const char *outer_name
,
1884 const char *inner_name
,
1886 size_t n_except_fds
,
1897 /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1898 * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1899 * /proc/self/fd works correctly. */
1901 r
= safe_fork_full(outer_name
,
1903 except_fds
, n_except_fds
,
1904 (flags
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
) & ~(FORK_REOPEN_LOG
|FORK_NEW_MOUNTNS
|FORK_MOUNTNS_SLAVE
), ret_pid
);
1912 r
= namespace_enter(pidns_fd
, mntns_fd
, netns_fd
, userns_fd
, root_fd
);
1914 log_full_errno(FLAGS_SET(flags
, FORK_LOG
) ? LOG_ERR
: LOG_DEBUG
, r
, "Failed to join namespace: %m");
1915 _exit(EXIT_FAILURE
);
1918 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1919 r
= safe_fork_full(inner_name
,
1921 except_fds
, n_except_fds
,
1922 flags
& ~(FORK_WAIT
|FORK_RESET_SIGNALS
|FORK_CLOSE_ALL_FDS
|FORK_REARRANGE_STDIO
), &pid
);
1924 _exit(EXIT_FAILURE
);
1932 r
= wait_for_terminate_and_check(inner_name
, pid
, FLAGS_SET(flags
, FORK_LOG
) ? WAIT_LOG
: 0);
1934 _exit(EXIT_FAILURE
);
1942 int set_oom_score_adjust(int value
) {
1943 char t
[DECIMAL_STR_MAX(int)];
1945 if (!oom_score_adjust_is_valid(value
))
1948 xsprintf(t
, "%i", value
);
1950 return write_string_file("/proc/self/oom_score_adj", t
,
1951 WRITE_STRING_FILE_VERIFY_ON_FAILURE
|WRITE_STRING_FILE_DISABLE_BUFFER
);
1954 int get_oom_score_adjust(int *ret
) {
1955 _cleanup_free_
char *t
= NULL
;
1958 r
= read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX
, &t
, NULL
);
1962 delete_trailing_chars(t
, WHITESPACE
);
1964 r
= safe_atoi(t
, &a
);
1968 if (!oom_score_adjust_is_valid(a
))
/* Converts an RLIMIT_NICE resource limit value into the nice level it corresponds to, computed as
 * PRIO_MAX - limit and clamped to the range PRIO_MIN … PRIO_MAX-1. */
static int rlimit_to_nice(rlim_t limit) {
        if (limit <= 1)
                return PRIO_MAX-1; /* i.e. 19 */

        if (limit >= -PRIO_MIN + PRIO_MAX)
                return PRIO_MIN; /* i.e. -20 */

        return PRIO_MAX - (int) limit;
}
/* Tries to set the nice level of the calling process to 'priority'; if that is refused for lack of
 * privileges, falls back to the closest level RLIMIT_NICE permits.
 *
 * Returns 1 if the requested level is in effect (either set now, or already the current level), 0 if
 * only the next best level could be applied, and a negative errno-style value on failure. */
int setpriority_closest(int priority) {
        struct rlimit highest;
        int r, current, limit;

        /* Try to set requested nice level */
        r = RET_NERRNO(setpriority(PRIO_PROCESS, 0, priority));
        if (r >= 0)
                return 1;
        if (!ERRNO_IS_NEG_PRIVILEGE(r))
                return r;

        /* getpriority() may legitimately return -1, hence errno is the only reliable error indicator */
        errno = 0;
        current = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        if (priority == current)
                return 1;

       /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it doesn't,
        * then the whole setpriority() system call is blocked to us, hence let's propagate the error
        * right-away */
        if (priority > current)
                return r;

        if (getrlimit(RLIMIT_NICE, &highest) < 0)
                return -errno;

        limit = rlimit_to_nice(highest.rlim_cur);

        /* Push to the allowed limit if we're higher than that. Note that we could also be less nice than
         * limit allows us, but still higher than what's requested. In that case our current value is
         * the best choice. */
        if (current > limit)
                if (setpriority(PRIO_PROCESS, 0, limit) < 0)
                        return -errno;

        log_debug("Cannot set requested nice level (%i), using next best (%i).", priority, MIN(current, limit));
        return 0;
}
2028 _noreturn_
void freeze(void) {
2031 /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
2032 * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
2033 * to be compatible with being called from signal handlers. */
2034 (void) close_all_fds_without_malloc(NULL
, 0);
2036 /* Let's not freeze right away, but keep reaping zombies. */
2040 if (waitid(P_ALL
, 0, &si
, WEXITED
) < 0 && errno
!= EINTR
)
2044 /* waitid() failed with an ECHLD error (because there are no left-over child processes) or any other
2045 * (unexpected) error. Freeze for good now! */
2050 int get_process_threads(pid_t pid
) {
2051 _cleanup_free_
char *t
= NULL
;
2057 r
= procfs_file_get_field(pid
, "status", "Threads", &t
);
2063 r
= safe_atoi(t
, &n
);
/* Returns > 0 if the calling process is a reaper process, 0 if not, and -errno if the
 * PR_GET_CHILD_SUBREAPER query fails. */
int is_reaper_process(void) {
        int b = 0;

        /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
         * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */

        if (getpid_cached() == 1)
                return true;

        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return b != 0;
}
/* Turns the child subreaper flag of the calling process on or off, according to 'b'.
 *
 * When running as PID 1 no prctl() is issued: requesting b == true succeeds without doing anything,
 * requesting b == false fails with -EINVAL. Otherwise returns 0 on success and -errno if
 * PR_SET_CHILD_SUBREAPER fails. */
int make_reaper_process(bool b) {

        if (getpid_cached() == 1) {

                if (!b)
                        return -EINVAL;

                return 0;
        }

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2105 DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t
*, posix_spawnattr_destroy
, NULL
);
2107 int posix_spawn_wrapper(
2112 PidRef
*ret_pidref
) {
2114 short flags
= POSIX_SPAWN_SETSIGMASK
;
2115 posix_spawnattr_t attr
;
2119 /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2120 * caller will be blocked until the child either exits or exec's. The memory of the child will be
2121 * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2124 * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2126 * returns 1: We're already in the right cgroup
2127 * 0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2128 * needs to call 'cg_attach' on their own */
2134 assert_se(sigfillset(&mask
) >= 0);
2136 r
= posix_spawnattr_init(&attr
);
2138 return -r
; /* These functions return a positive errno on failure */
2140 /* Initialization needs to succeed before we can set up a destructor. */
2141 _unused_
_cleanup_(posix_spawnattr_destroyp
) posix_spawnattr_t
*attr_destructor
= &attr
;
2143 #if HAVE_PIDFD_SPAWN
2144 static bool have_clone_into_cgroup
= true; /* kernel 5.7+ */
2145 _cleanup_close_
int cgroup_fd
= -EBADF
;
2147 if (cgroup
&& have_clone_into_cgroup
) {
2148 _cleanup_free_
char *resolved_cgroup
= NULL
;
2150 r
= cg_get_path_and_check(
2151 SYSTEMD_CGROUP_CONTROLLER
,
2158 cgroup_fd
= open(resolved_cgroup
, O_PATH
|O_DIRECTORY
|O_CLOEXEC
);
2162 r
= posix_spawnattr_setcgroup_np(&attr
, cgroup_fd
);
2166 flags
|= POSIX_SPAWN_SETCGROUP
;
2170 r
= posix_spawnattr_setflags(&attr
, flags
);
2173 r
= posix_spawnattr_setsigmask(&attr
, &mask
);
2177 #if HAVE_PIDFD_SPAWN
2178 _cleanup_close_
int pidfd
= -EBADF
;
2180 r
= pidfd_spawn(&pidfd
, path
, NULL
, &attr
, argv
, envp
);
2181 if (ERRNO_IS_NOT_SUPPORTED(r
) && FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
) && cg_is_threaded(cgroup
) > 0)
2182 return -EUCLEAN
; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
2183 turn that into something recognizable */
2184 if ((ERRNO_IS_NOT_SUPPORTED(r
) || ERRNO_IS_PRIVILEGE(r
) || r
== E2BIG
) &&
2185 FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
)) {
2186 /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
2187 * need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
2188 * Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
2189 * but not CLONE_INTO_CGROUP. */
2191 /* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
2192 * retry every time. */
2193 have_clone_into_cgroup
= false;
2195 flags
&= ~POSIX_SPAWN_SETCGROUP
;
2196 r
= posix_spawnattr_setflags(&attr
, flags
);
2200 r
= pidfd_spawn(&pidfd
, path
, NULL
, &attr
, argv
, envp
);
2205 r
= pidref_set_pidfd_consume(ret_pidref
, TAKE_FD(pidfd
));
2209 return FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
);
2213 r
= posix_spawn(&pid
, path
, NULL
, &attr
, argv
, envp
);
2217 r
= pidref_set_pid(ret_pidref
, pid
);
2221 return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
/* Opens "/proc" for enumeration. On success returns 0 and stores the directory stream in *ret (to be
 * released by the caller with closedir()); on failure returns -errno and leaves *ret untouched. */
int proc_dir_open(DIR **ret) {
        DIR *d;

        assert(ret);

        d = opendir("/proc");
        if (!d)
                return -errno;

        *ret = d;
        return 0;
}
2238 int proc_dir_read(DIR *d
, pid_t
*ret
) {
2245 de
= readdir_no_dot(d
);
2253 if (!IN_SET(de
->d_type
, DT_DIR
, DT_UNKNOWN
))
2256 if (parse_pid(de
->d_name
, ret
) >= 0)
2265 int proc_dir_read_pidref(DIR *d
, PidRef
*ret
) {
2273 r
= proc_dir_read(d
, &pid
);
2279 r
= pidref_set_pid(ret
, pid
);
2280 if (r
== -ESRCH
) /* gone by now? skip it */
2293 static const char *const sigchld_code_table
[] = {
2294 [CLD_EXITED
] = "exited",
2295 [CLD_KILLED
] = "killed",
2296 [CLD_DUMPED
] = "dumped",
2297 [CLD_TRAPPED
] = "trapped",
2298 [CLD_STOPPED
] = "stopped",
2299 [CLD_CONTINUED
] = "continued",
2302 DEFINE_STRING_TABLE_LOOKUP(sigchld_code
, int);
2304 static const char* const sched_policy_table
[] = {
2305 [SCHED_OTHER
] = "other",
2306 [SCHED_BATCH
] = "batch",
2307 [SCHED_IDLE
] = "idle",
2308 [SCHED_FIFO
] = "fifo",
2312 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy
, int, INT_MAX
);
2314 _noreturn_
void report_errno_and_exit(int errno_fd
, int error
) {
2318 _exit(EXIT_SUCCESS
);
2320 assert(errno_fd
>= 0);
2322 r
= loop_write(errno_fd
, &error
, sizeof(error
));
2324 log_debug_errno(r
, "Failed to write errno to errno_fd=%d: %m", errno_fd
);
2326 _exit(EXIT_FAILURE
);
2329 int read_errno(int errno_fd
) {
2332 assert(errno_fd
>= 0);
2334 /* The issue here is that it's impossible to distinguish between an error code returned by child and
2335 * IO error arose when reading it. So, the function logs errors and return EIO for the later case. */
2337 ssize_t n
= loop_read(errno_fd
, &r
, sizeof(r
), /* do_poll = */ false);
2339 log_debug_errno(n
, "Failed to read errno: %m");
2342 if (n
== sizeof(r
)) {
2345 if (r
< 0) /* child process reported an error, return it */
2346 return log_debug_errno(r
, "Child process failed with errno: %m");
2347 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Received an errno, but it's a positive value.");
2350 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Received unexpected amount of bytes while reading errno.");
2352 /* the process exited without reporting an error, assuming success */