1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <sys/personality.h>
14 #if HAVE_VALGRIND_VALGRIND_H
15 #include <valgrind/valgrind.h>
18 #include "sd-messages.h"
20 #include "alloc-util.h"
21 #include "architecture.h"
22 #include "argv-util.h"
23 #include "cgroup-util.h"
24 #include "dirent-util.h"
26 #include "errno-util.h"
31 #include "hostname-util.h"
33 #include "iovec-util.h"
34 #include "locale-util.h"
36 #include "memory-util.h"
37 #include "mountpoint-util.h"
38 #include "namespace-util.h"
39 #include "nulstr-util.h"
40 #include "parse-util.h"
41 #include "path-util.h"
42 #include "pidfd-util.h"
44 #include "process-util.h"
45 #include "raw-clone.h"
46 #include "rlimit-util.h"
47 #include "signal-util.h"
48 #include "socket-util.h"
49 #include "stat-util.h"
50 #include "stdio-util.h"
51 #include "string-table.h"
52 #include "string-util.h"
53 #include "time-util.h"
54 #include "user-util.h"
/* COMM_MAX_LEN is the size of the buffer pid_get_comm() allocates for the escaped process name
 * (it is passed both to new() and to cellescape() there). */
56 /* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
57 * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
59 #define COMM_MAX_LEN 128
/* Returns the state character of a process, taken from its procfs "stat" file.
 * For pid 0 or our own PID it short-circuits to 'R'. Otherwise the file is read as one line,
 * the last ')' (end of the comm field) is located with strrchr() and a character after it is
 * parsed with sscanf(" %c") and returned as unsigned char.
 * NOTE(review): the embedded numbering has gaps, so local declarations and the error paths
 * are not visible in this listing. */
61 static int get_process_state(pid_t pid
) {
62 _cleanup_free_
char *line
= NULL
;
69 /* Shortcut: if we are enquired about our own state, we are obviously running */
70 if (pid
== 0 || pid
== getpid_cached())
71 return (unsigned char) 'R';
73 p
= procfs_file_alloca(pid
, "stat");
75 r
= read_one_line_file(p
, &line
);
81 p
= strrchr(line
, ')');
87 if (sscanf(p
, " %c", &state
) != 1)
90 return (unsigned char) state
;
/* Stores the "comm" name of a process in *ret as a newly allocated, escaped string.
 * For pid 0 or our own PID the name is queried with prctl(PR_GET_NAME) into a zeroed buffer of
 * TASK_COMM_LEN + 1 bytes; for any other process the procfs "comm" file is read as one line.
 * The name is then run through cellescape() into a COMM_MAX_LEN byte buffer, which is handed
 * to the caller with TAKE_PTR().
 * NOTE(review): lines are missing from this listing (see numbering gaps); return values and
 * error handling cannot be confirmed from here. */
93 int pid_get_comm(pid_t pid
, char **ret
) {
94 _cleanup_free_
char *escaped
= NULL
, *comm
= NULL
;
100 if (pid
== 0 || pid
== getpid_cached()) {
101 comm
= new0(char, TASK_COMM_LEN
+ 1); /* Must fit in 16 byte according to prctl(2) */
105 if (prctl(PR_GET_NAME
, comm
) < 0)
110 p
= procfs_file_alloca(pid
, "comm");
112 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
113 r
= read_one_line_file(p
, &comm
);
120 escaped
= new(char, COMM_MAX_LEN
);
124 /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
125 cellescape(escaped
, COMM_MAX_LEN
, comm
);
127 *ret
= TAKE_PTR(escaped
);
/* PidRef flavour of pid_get_comm(). Visible steps: pidref_is_set() and pidref_is_remote()
 * checks, pid_get_comm() on pid->pid, then pidref_verify() before the string is moved into
 * *ret with TAKE_PTR().
 * NOTE(review): what each check returns is on lines not included in this listing. */
131 int pidref_get_comm(const PidRef
*pid
, char **ret
) {
132 _cleanup_free_
char *comm
= NULL
;
135 if (!pidref_is_set(pid
))
138 if (pidref_is_remote(pid
))
141 r
= pid_get_comm(pid
->pid
, &comm
);
145 r
= pidref_verify(pid
);
150 *ret
= TAKE_PTR(comm
);
/* Reads the procfs "cmdline" file of a process with read_virtual_file(), bounded by max_size.
 * When PROCESS_CMDLINE_COMM_FALLBACK is set in flags the visible code fetches the comm name via
 * pid_get_comm() and wraps it in square brackets with strjoin(). A result of 0 from the read is
 * treated as truncation.
 * NOTE(review): part of the parameter list and most branch bodies are missing from this
 * listing (see numbering gaps). */
154 static int pid_get_cmdline_nulstr(
157 ProcessCmdlineFlags flags
,
161 _cleanup_free_
char *t
= NULL
;
166 /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
167 * with a specified size.
169 * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
170 * (the case for kernel threads), or has a command line that resolves to the empty string, will
171 * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
174 * Returns an error, 0 if output was read but is truncated, 1 otherwise.
177 p
= procfs_file_alloca(pid
, "cmdline");
178 r
= read_virtual_file(p
, max_size
, &t
, &k
); /* Let's assume that each input byte results in >= 1
179 * columns of output. We ignore zero-width codepoints. */
186 if (!(flags
& PROCESS_CMDLINE_COMM_FALLBACK
))
189 /* Kernel threads have no argv[] */
190 _cleanup_free_
char *comm
= NULL
;
192 r
= pid_get_comm(pid
, &comm
);
197 t
= strjoin("[", comm
, "]");
203 if (r
== 0) /* truncation */
/* Retrieves the command line via pid_get_cmdline_nulstr() and formats it.
 * With PROCESS_CMDLINE_QUOTE or PROCESS_CMDLINE_QUOTE_POSIX the NUL-separated data is split with
 * strv_parse_nulstr_full() (dropping trailing NULs) and quoted with quote_command_line().
 * Otherwise trailing whitespace is removed and the text goes through
 * escape_non_printable_full() bounded by max_columns; XESCAPE_8_BIT is selected when
 * PROCESS_CMDLINE_USE_LOCALE is set and the locale is not UTF-8. The result is shrunk with
 * str_realloc().
 * NOTE(review): `!full * XESCAPE_FORCE_ELLIPSIS` parses as `(!full) * ...`; that looks
 * intended (force an ellipsis when the input was truncated) - confirm.
 * NOTE(review): lines are missing from this listing (see numbering gaps). */
215 int pid_get_cmdline(pid_t pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
216 _cleanup_free_
char *t
= NULL
;
223 /* Retrieve and format a command line. See above for discussion of retrieval options.
225 * There are two main formatting modes:
227 * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
228 * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
231 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
232 * returned string is of the specified console width at most, abbreviated with an ellipsis.
234 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
235 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *line otherwise. */
237 int full
= pid_get_cmdline_nulstr(pid
, max_columns
, flags
, &t
, &k
);
241 if (flags
& (PROCESS_CMDLINE_QUOTE
| PROCESS_CMDLINE_QUOTE_POSIX
)) {
242 ShellEscapeFlags shflags
= SHELL_ESCAPE_EMPTY
|
243 FLAGS_SET(flags
, PROCESS_CMDLINE_QUOTE_POSIX
) * SHELL_ESCAPE_POSIX
;
245 assert(!(flags
& PROCESS_CMDLINE_USE_LOCALE
));
247 _cleanup_strv_free_
char **args
= NULL
;
249 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
250 * See also issue #21186. */
251 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
255 ans
= quote_command_line(args
, shflags
);
259 /* Arguments are separated by NULs. Let's replace those with spaces. */
260 for (size_t i
= 0; i
< k
- 1; i
++)
264 delete_trailing_chars(t
, WHITESPACE
);
266 bool eight_bit
= (flags
& PROCESS_CMDLINE_USE_LOCALE
) && !is_locale_utf8();
268 ans
= escape_non_printable_full(t
, max_columns
,
269 eight_bit
* XESCAPE_8_BIT
| !full
* XESCAPE_FORCE_ELLIPSIS
);
273 ans
= str_realloc(ans
);
/* PidRef flavour of pid_get_cmdline(). Visible steps: pidref_is_set() and pidref_is_remote()
 * checks, pid_get_cmdline() on pid->pid with the caller's max_columns and flags, followed by
 * pidref_verify().
 * NOTE(review): the assignment to *ret and all return statements are on lines not shown. */
280 int pidref_get_cmdline(const PidRef
*pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
281 _cleanup_free_
char *s
= NULL
;
284 if (!pidref_is_set(pid
))
287 if (pidref_is_remote(pid
))
290 r
= pid_get_cmdline(pid
->pid
, max_columns
, flags
, &s
);
294 r
= pidref_verify(pid
);
/* Returns the command line of a process as a string vector. The only flag accepted is
 * PROCESS_CMDLINE_COMM_FALLBACK (asserted). The raw data is fetched with
 * pid_get_cmdline_nulstr() using SIZE_MAX as size limit and split with
 * strv_parse_nulstr_full(), dropping trailing NULs.
 * NOTE(review): error handling and the hand-over to *ret are on lines not shown. */
303 int pid_get_cmdline_strv(pid_t pid
, ProcessCmdlineFlags flags
, char ***ret
) {
304 _cleanup_free_
char *t
= NULL
;
310 assert((flags
& ~PROCESS_CMDLINE_COMM_FALLBACK
) == 0);
313 r
= pid_get_cmdline_nulstr(pid
, SIZE_MAX
, flags
, &t
, &k
);
317 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
/* PidRef flavour of pid_get_cmdline_strv(). Visible steps: pidref_is_set() and
 * pidref_is_remote() checks, pid_get_cmdline_strv() on pid->pid, pidref_verify(), then the
 * vector is moved into *ret with TAKE_PTR().
 * NOTE(review): return statements are on lines not included in this listing. */
325 int pidref_get_cmdline_strv(const PidRef
*pid
, ProcessCmdlineFlags flags
, char ***ret
) {
326 _cleanup_strv_free_
char **args
= NULL
;
329 if (!pidref_is_set(pid
))
332 if (pidref_is_remote(pid
))
335 r
= pid_get_cmdline_strv(pid
->pid
, flags
, &args
);
339 r
= pidref_verify(pid
);
344 *ret
= TAKE_PTR(args
);
/* Looks up the leader PID of a machine. The name ".host" has a dedicated branch (body not
 * shown). Other names must pass hostname_is_valid(); the file
 * "/run/systemd/machines/<machine>" is then parsed with parse_env_file(), the class read from
 * it must equal "container", and the leader string is converted with parse_pid().
 * NOTE(review): the parse_env_file() key list, the error codes and the store to *pid are on
 * lines not included in this listing. */
349 int container_get_leader(const char *machine
, pid_t
*pid
) {
350 _cleanup_free_
char *s
= NULL
, *class = NULL
;
358 if (streq(machine
, ".host")) {
363 if (!hostname_is_valid(machine
, 0))
366 p
= strjoina("/run/systemd/machines/", machine
);
367 r
= parse_env_file(NULL
, p
,
377 if (!streq_ptr(class, "container"))
380 r
= parse_pid(s
, &leader
);
/* Tests whether a process is a kernel thread by checking PF_KTHREAD in the flags field of its
 * procfs "stat" file. pid 0, pid 1 and our own PID are special-cased before any file access,
 * and pid_is_valid() is checked. The parser jumps to the last ')' (end of the comm field),
 * walks over six whitespace-separated fields with strspn()/strcspn(), isolates the following
 * field and converts it with safe_atollu(). The final result is !!(flags & PF_KTHREAD).
 * NOTE(review): pointer advancement inside the loop and all error returns are on lines not
 * included in this listing. */
390 int pid_is_kernel_thread(pid_t pid
) {
391 _cleanup_free_
char *line
= NULL
;
392 unsigned long long flags
;
398 if (IN_SET(pid
, 0, 1) || pid
== getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
400 if (!pid_is_valid(pid
))
403 p
= procfs_file_alloca(pid
, "stat");
404 r
= read_one_line_file(p
, &line
);
410 /* Skip past the comm field */
411 q
= strrchr(line
, ')');
416 /* Skip 6 fields to reach the flags field */
417 for (i
= 0; i
< 6; i
++) {
418 l
= strspn(q
, WHITESPACE
);
423 l
= strcspn(q
, WHITESPACE
);
429 /* Skip preceding whitespace */
430 l
= strspn(q
, WHITESPACE
);
435 /* Truncate the rest */
436 l
= strcspn(q
, WHITESPACE
);
441 r
= safe_atollu(q
, &flags
);
445 return !!(flags
& PF_KTHREAD
);
/* PidRef flavour of pid_is_kernel_thread(). The answer for pid->pid is computed first and the
 * PidRef is re-checked with pidref_verify() afterwards, so that a reused PID is detected.
 * NOTE(review): return statements are on lines not included in this listing. */
448 int pidref_is_kernel_thread(const PidRef
*pid
) {
451 if (!pidref_is_set(pid
))
454 if (pidref_is_remote(pid
))
457 result
= pid_is_kernel_thread(pid
->pid
);
461 r
= pidref_verify(pid
); /* Verify that the PID wasn't reused since */
/* Reads the target of the symlink "<procfs dir of pid>/<proc_file>" into *ret using
 * readlink_malloc(). A result of -ENOENT is turned into -ESRCH when proc_mounted() reports
 * that procfs is mounted; every other result is returned unchanged.
 * NOTE(review): declarations and any argument checks are on lines not shown here. */
468 static int get_process_link_contents(pid_t pid
, const char *proc_file
, char **ret
) {
474 p
= procfs_file_alloca(pid
, proc_file
);
476 r
= readlink_malloc(p
, ret
);
477 return (r
== -ENOENT
&& proc_mounted() > 0) ? -ESRCH
: r
;
/* Resolves the "exe" link of a process via get_process_link_contents() and then looks for a
 * trailing " (deleted)" marker in the result with endswith().
 * NOTE(review): what is done once the marker is found (presumably it is cut off) is on lines
 * not included in this listing - confirm. */
480 int get_process_exe(pid_t pid
, char **ret
) {
486 r
= get_process_link_contents(pid
, "exe", ret
);
491 d
= endswith(*ret
, " (deleted)");
/* Determines the UID of a process. pid 0 or our own PID takes a dedicated branch whose body is
 * not shown. Otherwise the "Uid" field of the procfs "status" file is fetched with
 * procfs_file_get_field() and converted into *ret with parse_uid().
 * NOTE(review): error handling between the two calls is on lines not shown. */
499 int pid_get_uid(pid_t pid
, uid_t
*ret
) {
505 if (pid
== 0 || pid
== getpid_cached()) {
510 _cleanup_free_
char *v
= NULL
;
511 r
= procfs_file_get_field(pid
, "status", "Uid", &v
);
517 return parse_uid(v
, ret
);
/* PidRef flavour of pid_get_uid(). After the pidref_is_set()/pidref_is_remote() checks it
 * queries pidfd_get_uid() on pid->fd and inspects the result with
 * ERRNO_IS_NEG_NOT_SUPPORTED(); the procfs based pid_get_uid() on pid->pid and a
 * pidref_verify() follow.
 * NOTE(review): presumably the pidfd result is returned directly unless it signals "not
 * supported"; the corresponding return is on a line not shown - confirm. */
520 int pidref_get_uid(const PidRef
*pid
, uid_t
*ret
) {
523 if (!pidref_is_set(pid
))
526 if (pidref_is_remote(pid
))
530 r
= pidfd_get_uid(pid
->fd
, ret
);
531 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r
))
536 r
= pid_get_uid(pid
->pid
, &uid
);
540 r
= pidref_verify(pid
);
/* Determines the GID of a process. pid 0 or our own PID takes a dedicated branch whose body is
 * not shown. Otherwise the "Gid" field of the procfs "status" file is fetched with
 * procfs_file_get_field() and converted into *ret with parse_gid().
 * NOTE(review): error handling between the two calls is on lines not shown. */
549 int get_process_gid(pid_t pid
, gid_t
*ret
) {
555 if (pid
== 0 || pid
== getpid_cached()) {
560 _cleanup_free_
char *v
= NULL
;
561 r
= procfs_file_get_field(pid
, "status", "Gid", &v
);
567 return parse_gid(v
, ret
);
/* Returns the current working directory of a process in *ret. For pid 0 or our own PID this
 * is safe_getcwd(); for any other process the procfs "cwd" link is read through
 * get_process_link_contents().
 * NOTE(review): two source lines before the first check are not part of this listing. */
570 int get_process_cwd(pid_t pid
, char **ret
) {
573 if (pid
== 0 || pid
== getpid_cached())
574 return safe_getcwd(ret
);
576 return get_process_link_contents(pid
, "cwd", ret
);
/* Returns the target of the procfs "root" link of a process in *ret, via
 * get_process_link_contents().
 * NOTE(review): one source line ahead of the return is not part of this listing. */
579 int get_process_root(pid_t pid
, char **ret
) {
581 return get_process_link_contents(pid
, "root", ret
);
/* ENVIRONMENT_BLOCK_MAX (5 MiB) caps how much environment data the readers in this file
 * process; get_process_environ() stops once its output size reaches it.
 *
 * get_process_environ() opens the procfs "environ" file of a process with fopen_unlocked()
 * and consumes it one character at a time with safe_fgetc(). The output buffer grows through
 * GREEDY_REALLOC(outcome, sz + 5); one branch appends '\n', the other appends the
 * cescape_char() encoding of the character. The buffer is handed over with TAKE_PTR().
 * NOTE(review): presumably '\n' replaces the NUL separators and the "+ 5" leaves room for
 * one escape sequence plus terminator; the deciding conditions are on lines not shown. */
584 #define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
586 int get_process_environ(pid_t pid
, char **ret
) {
587 _cleanup_fclose_
FILE *f
= NULL
;
588 _cleanup_free_
char *outcome
= NULL
;
596 p
= procfs_file_alloca(pid
, "environ");
598 r
= fopen_unlocked(p
, "re", &f
);
607 if (sz
>= ENVIRONMENT_BLOCK_MAX
)
610 if (!GREEDY_REALLOC(outcome
, sz
+ 5))
613 r
= safe_fgetc(f
, &c
);
620 outcome
[sz
++] = '\n';
622 sz
+= cescape_char(c
, outcome
+ sz
);
626 *ret
= TAKE_PTR(outcome
);
/* Determines the parent PID of a process. Visible logic: in one branch pid is replaced by
 * getpid_cached(); pid 1 yields -EADDRNOTAVAIL; our own PID has a dedicated branch; for other
 * processes the procfs "stat" line is read and parsing starts at the last ')'. A parsed ppid
 * of zero also yields -EADDRNOTAVAIL, and the value is range-checked against pid_t.
 * NOTE(review): the sscanf() call, the branch conditions and the store to *ret are on lines
 * not included in this listing. */
631 int pid_get_ppid(pid_t pid
, pid_t
*ret
) {
632 _cleanup_free_
char *line
= NULL
;
640 pid
= getpid_cached();
641 if (pid
== 1) /* PID 1 has no parent, shortcut this case */
642 return -EADDRNOTAVAIL
;
644 if (pid
== getpid_cached()) {
650 p
= procfs_file_alloca(pid
, "stat");
651 r
= read_one_line_file(p
, &line
);
657 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
658 * value, so let's skip over it manually */
660 p
= strrchr(line
, ')');
671 /* If ppid is zero the process has no parent. Which might be the case for PID 1 (caught above)
672 * but also for processes originating in other namespaces that are inserted into a pidns.
673 * Return a recognizable error in this case. */
675 return -EADDRNOTAVAIL
;
677 if ((pid_t
) ppid
< 0 || (unsigned long) (pid_t
) ppid
!= ppid
)
/* PidRef flavour of pid_get_ppid(). When the PidRef carries a pidfd (fd >= 0) it queries
 * pidfd_get_ppid() and inspects the result with ERRNO_IS_NEG_NOT_SUPPORTED(). The procfs
 * based pid_get_ppid() is called with a local buffer only when ret is non-NULL, and is
 * followed by pidref_verify().
 * NOTE(review): return statements and the final store to *ret are on lines not shown. */
686 int pidref_get_ppid(const PidRef
*pidref
, pid_t
*ret
) {
689 if (!pidref_is_set(pidref
))
692 if (pidref_is_remote(pidref
))
695 if (pidref
->fd
>= 0) {
696 r
= pidfd_get_ppid(pidref
->fd
, ret
);
697 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r
))
702 r
= pid_get_ppid(pidref
->pid
, ret
? &ppid
: NULL
);
706 r
= pidref_verify(pidref
);
/* Returns the parent of a process as a PidRef. The parent PID is queried first; then, for up
 * to 16 attempts, a PidRef is created for it with pidref_set_pid(). If the original PidRef has
 * a pidfd, the parent PID is queried again and compared with the PID just pinned; a mismatch
 * means the PID may have been reused in between. On success the new PidRef is moved into *ret
 * with TAKE_PIDREF(). After 16 failed attempts -ENOTRECOVERABLE is returned.
 * NOTE(review): error returns and the retry statement are on lines not shown. */
715 int pidref_get_ppid_as_pidref(const PidRef
*pidref
, PidRef
*ret
) {
721 r
= pidref_get_ppid(pidref
, &ppid
);
725 for (unsigned attempt
= 0; attempt
< 16; attempt
++) {
726 _cleanup_(pidref_done
) PidRef parent
= PIDREF_NULL
;
728 r
= pidref_set_pid(&parent
, ppid
);
732 /* If we have a pidfd of the original PID, let's verify that the process we acquired really
733 * is the parent still */
734 if (pidref
->fd
>= 0) {
735 r
= pidref_get_ppid(pidref
, &ppid
);
739 /* Did the PPID change since we queried it? if so we might have pinned the wrong
740 * process, if its PID got reused by now. Let's try again */
741 if (parent
.pid
!= ppid
)
745 *ret
= TAKE_PIDREF(parent
);
749 /* Give up after 16 tries */
750 return -ENOTRECOVERABLE
;
/* Reads the start time of a process from its procfs "stat" file. Parsing begins at the last
 * ')' of the line; an sscanf() format skips the intermediate fields and reads starttime with
 * "%lu". The value is converted with jiffies_to_usec() and stored in *ret; the inline comment
 * marks it as CLOCK_BOOTTIME based.
 * NOTE(review): most of the format string and the declaration of the scanned variable are on
 * lines not shown; confirm that its type matches "%lu". */
753 int pid_get_start_time(pid_t pid
, usec_t
*ret
) {
754 _cleanup_free_
char *line
= NULL
;
760 p
= procfs_file_alloca(pid
, "stat");
761 r
= read_one_line_file(p
, &line
);
767 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
768 * value, so let's skip over it manually */
770 p
= strrchr(line
, ')');
793 "%*i " /* priority */
795 "%*u " /* num_threads */
796 "%*u " /* itrealvalue */
797 "%lu ", /* starttime */
802 *ret
= jiffies_to_usec(llu
); /* CLOCK_BOOTTIME */
/* PidRef flavour of pid_get_start_time(). The time is read into a local only when ret is
 * non-NULL, and pidref_verify() is called afterwards.
 * NOTE(review): return statements and the store to *ret are on lines not shown. */
807 int pidref_get_start_time(const PidRef
*pid
, usec_t
*ret
) {
811 if (!pidref_is_set(pid
))
814 if (pidref_is_remote(pid
))
817 r
= pid_get_start_time(pid
->pid
, ret
? &t
: NULL
);
821 r
= pidref_verify(pid
);
/* Reads the "Umask" field from the procfs "status" file of a process with
 * procfs_file_get_field() and converts it into *ret with parse_mode().
 * NOTE(review): error handling between the two calls is on lines not shown. */
831 int get_process_umask(pid_t pid
, mode_t
*ret
) {
832 _cleanup_free_
char *m
= NULL
;
838 r
= procfs_file_get_field(pid
, "status", "Umask", &m
);
844 return parse_mode(m
, ret
);
847 int wait_for_terminate(pid_t pid
, siginfo_t
*ret
) {
848 return pidref_wait_for_terminate(&PIDREF_MAKE_FROM_PID(pid
), ret
);
/* Waits for a process to terminate and reports how it ended.
 * Visible logic: the PidRef is checked with pidref_is_set() and pidref_is_remote(), and PID 1
 * or our own process get a separate check. A process name is looked up with
 * pidref_get_comm() (failure is only logged at debug level). Abnormal outcomes are logged at
 * LOG_ERR when WAIT_LOG_ABNORMAL is set, else at LOG_DEBUG. After pidref_wait_for_terminate():
 *   - CLD_EXITED: a non-zero exit status is logged (LOG_ERR with
 *     WAIT_LOG_NON_ZERO_EXIT_STATUS, else LOG_DEBUG) and si_status is returned;
 *   - CLD_KILLED / CLD_DUMPED: the terminating signal is logged;
 *   - anything else: "failed due to unknown reason" is logged.
 * NOTE(review): the fragment below still mentions "check_exit_code", which is not a parameter
 * of this function - the text looks stale.
 * NOTE(review): log_debug("%s succeeded.", name) is the only log call here that does not wrap
 * name in strna(); confirm name cannot be NULL at that point.
 * NOTE(review): lines are missing from this listing (see numbering gaps), including the
 * opening of the comment fragment and the error returns. */
853 * < 0 : wait_for_terminate() failed to get the state of the
854 * process, the process was terminated by a signal, or
855 * failed for an unknown reason.
856 * >=0 : The process terminated normally, and its exit code is
859 * That is, success is indicated by a return value of zero, and an
860 * error is indicated by a non-zero value.
862 * A warning is emitted if the process terminates abnormally,
863 * and also if it returns non-zero unless check_exit_code is true.
865 int pidref_wait_for_terminate_and_check(const char *name
, PidRef
*pidref
, WaitFlags flags
) {
868 if (!pidref_is_set(pidref
))
870 if (pidref_is_remote(pidref
))
872 if (pidref
->pid
== 1 || pidref_is_self(pidref
))
875 _cleanup_free_
char *buffer
= NULL
;
877 r
= pidref_get_comm(pidref
, &buffer
);
879 log_debug_errno(r
, "Failed to acquire process name of " PID_FMT
", ignoring: %m", pidref
->pid
);
884 int prio
= flags
& WAIT_LOG_ABNORMAL
? LOG_ERR
: LOG_DEBUG
;
887 r
= pidref_wait_for_terminate(pidref
, &status
);
889 return log_full_errno(prio
, r
, "Failed to wait for %s: %m", strna(name
));
891 if (status
.si_code
== CLD_EXITED
) {
892 if (status
.si_status
!= EXIT_SUCCESS
)
893 log_full(flags
& WAIT_LOG_NON_ZERO_EXIT_STATUS
? LOG_ERR
: LOG_DEBUG
,
894 "%s failed with exit status %i.", strna(name
), status
.si_status
);
896 log_debug("%s succeeded.", name
);
898 return status
.si_status
;
900 } else if (IN_SET(status
.si_code
, CLD_KILLED
, CLD_DUMPED
)) {
902 log_full(prio
, "%s terminated by signal %s.", strna(name
), signal_to_string(status
.si_status
));
906 log_full(prio
, "%s failed due to unknown reason.", strna(name
));
910 int wait_for_terminate_and_check(const char *name
, pid_t pid
, WaitFlags flags
) {
911 return pidref_wait_for_terminate_and_check(name
, &PIDREF_MAKE_FROM_PID(pid
), flags
);
/* Waits for a child to exit, giving up after a timeout.
 * Visible logic: a signal mask containing only SIGCHLD is built, and the deadline is computed
 * as now(CLOCK_MONOTONIC) + timeout with usec_add(). Each round sleeps in sigtimedwait() for
 * the remaining time and then polls the child with waitid(P_PID, pid, WEXITED|WNOHANG). If the
 * reported si_pid matches and the child exited (CLD_EXITED), the result is 0 for exit status 0
 * and -EPROTO otherwise.
 * NOTE(review): the loop construct, the timeout check and the remaining return paths are on
 * lines not included in this listing; the comment fragment below also lacks its opening. */
917 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
918 * was terminated by a signal, or failed for an unknown reason.
920 * >=0 : The process terminated normally with no failures.
922 * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
923 * states are indicated by error is indicated by a non-zero value.
925 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
926 * to remain entirely race-free.
928 int wait_for_terminate_with_timeout(pid_t pid
, usec_t timeout
) {
933 assert_se(sigemptyset(&mask
) == 0);
934 assert_se(sigaddset(&mask
, SIGCHLD
) == 0);
936 /* Drop into a sigtimewait-based timeout. Waiting for the
938 until
= usec_add(now(CLOCK_MONOTONIC
), timeout
);
941 siginfo_t status
= {};
943 n
= now(CLOCK_MONOTONIC
);
947 r
= RET_NERRNO(sigtimedwait(&mask
, NULL
, TIMESPEC_STORE(until
- n
)));
948 /* Assuming we woke due to the child exiting. */
949 if (waitid(P_PID
, pid
, &status
, WEXITED
|WNOHANG
) == 0) {
950 if (status
.si_pid
== pid
) {
951 /* This is the correct child. */
952 if (status
.si_code
== CLD_EXITED
)
953 return status
.si_status
== 0 ? 0 : -EPROTO
;
958 /* Not the child, check for errors and proceed appropriately */
962 /* Timed out, child is likely hung. */
965 /* Received a different signal and should retry */
968 /* Return any unexpected errors */
/* Sends SIGKILL to the process and then waits for it with wait_for_terminate(); the results
 * of both calls are deliberately ignored.
 * NOTE(review): two source lines ahead of the kill() are not part of this listing. */
977 void sigkill_wait(pid_t pid
) {
980 (void) kill(pid
, SIGKILL
);
981 (void) wait_for_terminate(pid
, NULL
);
/* Variant of sigkill_wait() that takes a pointer to the PID.
 * NOTE(review): the body is not part of this listing; presumably it kills and reaps *pid when
 * it refers to a real child - confirm against the full source. */
984 void sigkill_waitp(pid_t
*pid
) {
/* Sends SIGTERM through kill_and_sigcont() and then waits for the process with
 * wait_for_terminate(); the results of both calls are deliberately ignored.
 * NOTE(review): two source lines ahead of the first call are not part of this listing. */
995 void sigterm_wait(pid_t pid
) {
998 (void) kill_and_sigcont(pid
, SIGTERM
);
999 (void) wait_for_terminate(pid
, NULL
);
/* Sends SIGKILL to the process without waiting for it; the kill() result is ignored.
 * NOTE(review): two source lines ahead of the kill() are not part of this listing. */
1002 void sigkill_nowait(pid_t pid
) {
1005 (void) kill(pid
, SIGKILL
);
/* Pointer-taking variant of sigkill_nowait(): ends by calling sigkill_nowait(*pid).
 * NOTE(review): the checks that precede the call are on lines not part of this listing. */
1008 void sigkill_nowaitp(pid_t
*pid
) {
1016 sigkill_nowait(*pid
);
/* Sends signal sig to the process and, if that succeeded and sig is neither SIGCONT nor
 * SIGKILL, follows up with SIGCONT. The kill() result is converted with RET_NERRNO(); the
 * result of the SIGCONT is ignored.
 * NOTE(review): the final return statement is on a line not part of this listing. */
1019 int kill_and_sigcont(pid_t pid
, int sig
) {
1022 r
= RET_NERRNO(kill(pid
, sig
));
1024 /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
1025 * affected by a process being suspended anyway. */
1026 if (r
>= 0 && !IN_SET(sig
, SIGCONT
, SIGKILL
))
1027 (void) kill(pid
, SIGCONT
);
/* Looks up one environment variable of a process.
 * For pid 0 or our own PID the value comes from getenv() and is duplicated with
 * strdup_to_full(). For other processes pid_is_valid() is checked and the procfs "environ"
 * file is opened. Entries are read with read_nul_string() (bounded by LONG_LINE_MAX); the
 * search gives up once the running total exceeds ENVIRONMENT_BLOCK_MAX and stops at EOF. An
 * entry matches when it starts with field immediately followed by '='; the text after the '='
 * is then duplicated into *ret.
 * NOTE(review): the loop construct, the update of the running total and the not-found result
 * are on lines not included in this listing. */
1032 int getenv_for_pid(pid_t pid
, const char *field
, char **ret
) {
1033 _cleanup_fclose_
FILE *f
= NULL
;
1042 if (pid
== 0 || pid
== getpid_cached())
1043 return strdup_to_full(ret
, getenv(field
));
1045 if (!pid_is_valid(pid
))
1048 path
= procfs_file_alloca(pid
, "environ");
1050 r
= fopen_unlocked(path
, "re", &f
);
1057 _cleanup_free_
char *line
= NULL
;
1060 if (sum
> ENVIRONMENT_BLOCK_MAX
) /* Give up searching eventually */
1063 r
= read_nul_string(f
, LONG_LINE_MAX
, &line
);
1066 if (r
== 0) /* EOF */
1071 match
= startswith(line
, field
);
1072 if (match
&& *match
== '=')
1073 return strdup_to_full(ret
, match
+ 1);
/* Tells whether the referenced process is a direct child of the calling process: the parent
 * PID obtained from pidref_get_ppid() is compared with getpid_cached(). Unset and remote
 * PidRefs, PID 1 and our own process are checked first, and -EADDRNOTAVAIL from the ppid
 * lookup is treated specially (per the inline comment: not our child).
 * NOTE(review): the values returned by the early checks are on lines not shown. */
1080 int pidref_is_my_child(PidRef
*pid
) {
1083 if (!pidref_is_set(pid
))
1086 if (pidref_is_remote(pid
))
1089 if (pid
->pid
== 1 || pidref_is_self(pid
))
1093 r
= pidref_get_ppid(pid
, &ppid
);
1094 if (r
== -EADDRNOTAVAIL
) /* if this process is outside of our pidns, it is definitely not our child */
1099 return ppid
== getpid_cached();
/* PID based wrapper: ends by delegating to pidref_is_my_child() on a temporary PidRef built
 * with PIDREF_MAKE_FROM_PID().
 * NOTE(review): several source lines ahead of the return are not part of this listing. */
1102 int pid_is_my_child(pid_t pid
) {
1107 return pidref_is_my_child(&PIDREF_MAKE_FROM_PID(pid
));
/* Checks whether the referenced PID still exists at all (zombies included, per the inline
 * comment). After the unset/remote/PID 1/self checks the process is probed with
 * pidref_kill(pid, 0), i.e. signal 0.
 * NOTE(review): how the probe result is mapped to the return value is on lines not shown. */
1110 int pidref_is_unwaited(PidRef
*pid
) {
1113 /* Checks whether a PID is still valid at all, including a zombie */
1115 if (!pidref_is_set(pid
))
1118 if (pidref_is_remote(pid
))
1121 if (pid
->pid
== 1 || pidref_is_self(pid
))
1124 r
= pidref_kill(pid
, 0);
/* PID based wrapper: ends by delegating to pidref_is_unwaited() on a temporary PidRef built
 * with PIDREF_MAKE_FROM_PID().
 * NOTE(review): several source lines ahead of the return are not part of this listing. */
1133 int pid_is_unwaited(pid_t pid
) {
1138 return pidref_is_unwaited(&PIDREF_MAKE_FROM_PID(pid
));
/* Checks whether a PID is still valid and not a zombie (per the inline comment). PIDs <= 1 and
 * our own PID are handled before anything is read; for other processes the state character is
 * obtained from get_process_state().
 * NOTE(review): the values returned by each branch and the evaluation of the state character
 * are on lines not included in this listing. */
1141 int pid_is_alive(pid_t pid
) {
1144 /* Checks whether a PID is still valid and not a zombie */
1149 if (pid
<= 1) /* If we or PID 1 would be a zombie, this code would not be running */
1152 if (pid
== getpid_cached())
1155 r
= get_process_state(pid
);
/* PidRef flavour of pid_is_alive(). The result for pidref->pid is computed first; the code
 * asserts that it is never -ESRCH, then re-checks the PidRef with pidref_verify().
 * NOTE(review): return statements are on lines not included in this listing. */
1164 int pidref_is_alive(const PidRef
*pidref
) {
1167 if (!pidref_is_set(pidref
))
1170 if (pidref_is_remote(pidref
))
1173 result
= pid_is_alive(pidref
->pid
);
1175 assert(result
!= -ESRCH
);
1179 r
= pidref_verify(pidref
);
/* Checks whether two processes share the same root file system.
 * Visible logic: a PidRef for ourselves is set up with pidref_set_self() (the inline comment
 * says a NULL argument means "ourselves"). Unset PidRefs, the both-remote case and the
 * one-remote case are each checked separately, and equal PidRefs are short-circuited with
 * pidref_equal(). Otherwise the procfs "root" entries of both PIDs are compared with
 * inode_same(); -ENOENT becomes -ENOSYS when procfs is not mounted and -ESRCH otherwise.
 * Both PidRefs are verified again afterwards.
 * NOTE(review): the values returned by the early checks and the final return are on lines not
 * included in this listing. */
1188 int pidref_from_same_root_fs(PidRef
*a
, PidRef
*b
) {
1189 _cleanup_(pidref_done
) PidRef self
= PIDREF_NULL
;
1192 /* Checks if the two specified processes have the same root fs. Either can be specified as NULL in
1193 * which case we'll check against ourselves. */
1196 r
= pidref_set_self(&self
);
1205 if (!pidref_is_set(a
) || !pidref_is_set(b
))
1208 /* If exactly one of the two processes is remote they cannot have the same root fs, but if both of
1209 * them are we don't know */
1210 if (pidref_is_remote(a
) && pidref_is_remote(b
))
1212 if (pidref_is_remote(a
) || pidref_is_remote(b
))
1215 if (pidref_equal(a
, b
))
1218 const char *roota
= procfs_file_alloca(a
->pid
, "root");
1219 const char *rootb
= procfs_file_alloca(b
->pid
, "root");
1221 int result
= inode_same(roota
, rootb
, 0);
1222 if (result
== -ENOENT
)
1223 return proc_mounted() == 0 ? -ENOSYS
: -ESRCH
;
1227 r
= pidref_verify(a
);
1230 r
= pidref_verify(b
);
/* Tells whether the calling thread is the main thread, i.e. whether getpid_cached() equals
 * gettid(). The answer is kept in a thread_local int that starts out as -1.
 * NOTE(review): the condition guarding the assignment and the return statement are on lines
 * not included in this listing. */
1237 bool is_main_thread(void) {
1238 static thread_local
int cached
= -1;
1241 cached
= getpid_cached() == gettid();
/* Returns true if oa lies within the inclusive range
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX], false otherwise. */
bool oom_score_adjust_is_valid(int oa) {
        return oa >= OOM_SCORE_ADJ_MIN && oa <= OOM_SCORE_ADJ_MAX;
}
/* Parses a personality specifier. The string is mapped to an Architecture with
 * architecture_from_string(); an unknown architecture yields PERSONALITY_INVALID. The native
 * architecture and, when ARCHITECTURE_SECONDARY is defined, the secondary one are recognised;
 * anything else falls through to PERSONALITY_INVALID.
 * NOTE(review): the personality values returned for the two recognised architectures and the
 * first guard are on lines not included in this listing. */
1250 unsigned long personality_from_string(const char *p
) {
1251 Architecture architecture
;
1254 return PERSONALITY_INVALID
;
1256 /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1257 * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1258 * the same register size. */
1260 architecture
= architecture_from_string(p
);
1261 if (architecture
< 0)
1262 return PERSONALITY_INVALID
;
1264 if (architecture
== native_architecture())
1266 #ifdef ARCHITECTURE_SECONDARY
1267 if (architecture
== ARCHITECTURE_SECONDARY
)
1271 return PERSONALITY_INVALID
;
/* Converts a personality value to the name of the matching architecture. One branch selects
 * native_architecture(); when ARCHITECTURE_SECONDARY is defined, PER_LINUX32 selects the
 * secondary architecture. The name comes from architecture_to_string().
 * NOTE(review): the first condition and the result for an unmatched value (architecture < 0)
 * are on lines not included in this listing. */
1274 const char* personality_to_string(unsigned long p
) {
1275 Architecture architecture
= _ARCHITECTURE_INVALID
;
1278 architecture
= native_architecture();
1279 #ifdef ARCHITECTURE_SECONDARY
1280 else if (p
== PER_LINUX32
)
1281 architecture
= ARCHITECTURE_SECONDARY
;
1284 if (architecture
< 0)
1287 return architecture_to_string(architecture
);
/* Wrapper around personality(2) that, per the inline comment, normalises the two ways glibc
 * may report a failure (errno vs. negative return value).
 * NOTE(review): everything after the personality() call is on lines not part of this
 * listing. */
1290 int safe_personality(unsigned long p
) {
1293 /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno,
1294 * and in others as negative return value containing an errno-like value. Let's work around this: this is a
1295 * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and
1296 * the return value indicating the same issue, so that we are definitely on the safe side.
1298 * See https://github.com/systemd/systemd/issues/6737 */
1301 ret
= personality(p
);
/* Queries the current personality by calling safe_personality(PERSONALITY_INVALID) and
 * reduces it with OPINIONATED_PERSONALITY_MASK, distinguishing PER_LINUX32 from the rest.
 * NOTE(review): the error path and the stores to *ret are on lines not part of this
 * listing. */
1312 int opinionated_personality(unsigned long *ret
) {
1315 /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1316 * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1317 * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1319 current
= safe_personality(PERSONALITY_INVALID
);
1323 if (((unsigned long) current
& OPINIONATED_PERSONALITY_MASK
) == PER_LINUX32
)
/* Only active when built with HAVE_VALGRIND_VALGRIND_H, and only if we are PID 1 running under
 * valgrind: forks a helper with raw_clone(SIGCHLD), logs a failure or the helper's PID, and
 * waits for the helper with wait_for_terminate().
 * NOTE(review): what the helper child does and the matching #endif are on lines not included
 * in this listing. */
1331 void valgrind_summary_hack(void) {
1332 #if HAVE_VALGRIND_VALGRIND_H
1333 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND
) {
1335 pid
= raw_clone(SIGCHLD
);
1339 LOG_MESSAGE_ID(SD_MESSAGE_VALGRIND_HELPER_FORK_STR
),
1340 LOG_MESSAGE("Failed to fork off valgrind helper: %m"));
1344 log_info("Spawned valgrind helper as PID "PID_FMT
".", pid
);
1345 (void) wait_for_terminate(pid
, NULL
);
/* Comparison callback over two pid_t values, suitable for qsort() per the inline comment.
 * NOTE(review): the comparison itself is on a line not part of this listing. */
1351 int pid_compare_func(const pid_t
*a
, const pid_t
*b
) {
1352 /* Suitable for usage in qsort() */
/* Returns true if n is a valid nice level, i.e. PRIO_MIN <= n < PRIO_MAX.
 * Note that the upper bound is exclusive. */
bool nice_is_valid(int n) {
        return n >= PRIO_MIN && n < PRIO_MAX;
}
/* Returns true if i is one of the scheduling policies accepted here: SCHED_OTHER,
 * SCHED_BATCH, SCHED_IDLE, SCHED_FIFO or SCHED_RR. Any other value yields false. */
bool sched_policy_is_valid(int i) {
        switch (i) {

        case SCHED_OTHER:
        case SCHED_BATCH:
        case SCHED_IDLE:
        case SCHED_FIFO:
        case SCHED_RR:
                return true;

        default:
                return false;
        }
}
/* Returns true if i is a usable scheduling priority: non-negative and no larger than the
 * maximum that sched_get_priority_max() reports for SCHED_RR. */
bool sched_priority_is_valid(int i) {
        return i >= 0 && i <= sched_get_priority_max(SCHED_RR);
}
/* State of the PID cache used by getpid_cached() and reset_cached_pid(): two sentinel values
 * (CACHED_PID_UNSET = 0, CACHED_PID_BUSY = -1) and the file-local variable holding either a
 * sentinel or the cached PID. */
1368 /* The cached PID, possible values:
1370 * == UNSET [0] → cache not initialized yet
1371 * == BUSY [-1] → some thread is initializing it at the moment
1372 * any other → the cached PID
1375 #define CACHED_PID_UNSET ((pid_t) 0)
1376 #define CACHED_PID_BUSY ((pid_t) -1)
1378 static pid_t cached_pid
= CACHED_PID_UNSET
;
1380 void reset_cached_pid(void) {
1381 /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
1382 cached_pid
= CACHED_PID_UNSET
;
/* Returns the PID of the calling process, caching it in cached_pid.
 * Visible logic: an atomic compare-exchange is performed and the code then switches on the
 * value observed:
 *   - CACHED_PID_UNSET: the cache is initialised now; reset_cached_pid() is registered as
 *     child-side handler with pthread_atfork(), and if that fails the cache is put back to
 *     CACHED_PID_UNSET, otherwise the new PID is stored;
 *   - CACHED_PID_BUSY: another thread is initialising the cache;
 *   - any other value is the cached PID and is returned.
 * NOTE(review): the arguments of the compare-exchange, the use of the static `installed`
 * flag and the values returned by the first two cases are on lines not part of this
 * listing. */
1385 pid_t
getpid_cached(void) {
1386 static bool installed
= false;
1387 pid_t current_value
= CACHED_PID_UNSET
;
1389 /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1390 * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1391 * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1392 * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1394 * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1395 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1398 (void) __atomic_compare_exchange_n(
1406 switch (current_value
) {
1408 case CACHED_PID_UNSET
: { /* Not initialized yet, then do so now */
1414 /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1415 * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1416 * we'll check for errors only in the most generic fashion possible. */
1418 if (pthread_atfork(NULL
, NULL
, reset_cached_pid
) != 0) {
1419 /* OOM? Let's try again later */
1420 cached_pid
= CACHED_PID_UNSET
;
1427 cached_pid
= new_pid
;
1431 case CACHED_PID_BUSY
: /* Somebody else is currently initializing */
1434 default: /* Properly initialized */
1435 return current_value
;
/* Ends by logging "Need to be root." as an error and returning the synthetic EPERM error
 * produced by log_error_errno().
 * NOTE(review): the check that lets privileged callers pass is on lines not part of this
 * listing. */
1439 int must_be_root(void) {
1444 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Need to be root.");
/* Wrapper around clone() that carves the child's stack out of the caller's own stack.
 * The flags must not contain CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
 * CLONE_CHILD_CLEARTID or CLONE_SETTLS (asserted). Three pages are reserved with alloca(); the
 * pointer is moved one page ahead and rounded up to a page boundary with ALIGN_TO() before
 * being passed to clone() or __clone2().
 * NOTE(review): the definition of `ps`, the preprocessor condition that chooses between
 * clone() and __clone2(), and the return path are on lines not part of this listing. */
1447 pid_t
clone_with_nested_stack(int (*fn
)(void *), int flags
, void *userdata
) {
1452 /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1453 * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1455 * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1456 * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1457 * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1458 * strictly single-threaded or your child never calls malloc(). */
1460 assert((flags
& (CLONE_VM
|CLONE_PARENT_SETTID
|CLONE_CHILD_SETTID
|
1461 CLONE_CHILD_CLEARTID
|CLONE_SETTLS
)) == 0);
1463 /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1464 * the net effect is that the child will have the start of its stack inside the stack of the parent,
1465 * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1466 * since we don't want to deal with differences between systems where the stack grows backwards or
1467 * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1468 * want it page aligned, hence we'll allocate one page more. Makes 3. */
1471 mystack
= alloca(ps
*3);
1472 mystack
= (uint8_t*) mystack
+ ps
; /* move pointer one page ahead since stacks usually grow backwards */
1473 mystack
= (void*) ALIGN_TO((uintptr_t) mystack
, ps
); /* align to page size (moving things further ahead) */
1476 pid
= clone(fn
, mystack
, flags
, userdata
);
1478 pid
= __clone2(fn
, mystack
, ps
, flags
, userdata
);
/* Cleanup helper: restores the signal mask stored in **ssp with sigprocmask(SIG_SETMASK);
 * the result is ignored. It is the function named in the _cleanup_() attribute of
 * pidref_safe_fork_full()'s saved_ssp variable.
 * NOTE(review): one source line ahead of the call (presumably a NULL check on *ssp) is not
 * part of this listing - confirm. */
1486 static void restore_sigsetp(sigset_t
**ssp
) {
1488 (void) sigprocmask(SIG_SETMASK
, *ssp
, NULL
);
/* Maps the FORK_DEATHSIG_* bits in flags to a signal number: FORK_DEATHSIG_SIGTERM selects
 * SIGTERM, otherwise FORK_DEATHSIG_SIGINT selects SIGINT.
 * NOTE(review): the final alternative of the conditional expression is on a line not part of
 * this listing. */
1491 static int fork_flags_to_signal(ForkFlags flags
) {
1492 return (flags
& FORK_DEATHSIG_SIGTERM
) ? SIGTERM
:
1493 (flags
& FORK_DEATHSIG_SIGINT
) ? SIGINT
:
1497 int pidref_safe_fork_full(
1499 const int stdio_fds
[3],
1501 size_t n_except_fds
,
1505 pid_t original_pid
, pid
;
1506 sigset_t saved_ss
, ss
;
1507 _unused_
_cleanup_(restore_sigsetp
) sigset_t
*saved_ssp
= NULL
;
1508 bool block_signals
= false, block_all
= false, intermediary
= false;
1509 _cleanup_close_pair_
int pidref_transport_fds
[2] = EBADF_PAIR
;
1512 assert(!FLAGS_SET(flags
, FORK_WAIT
|FORK_FREEZE
));
1513 assert(!FLAGS_SET(flags
, FORK_DETACH
) ||
1514 (flags
& (FORK_WAIT
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGKILL
)) == 0);
1516 /* A wrapper around fork(), that does a couple of important initializations in addition to mere
1517 * forking. If provided, ret_pid is initialized in both the parent and the child process, both times
1518 * referencing the child process. Returns == 0 in the child and > 0 in the parent. */
1520 prio
= flags
& FORK_LOG
? LOG_ERR
: LOG_DEBUG
;
1522 original_pid
= getpid_cached();
1524 if (flags
& FORK_FLUSH_STDIO
) {
1526 fflush(stderr
); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
1529 if (flags
& (FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
)) {
1530 /* We temporarily block all signals, so that the new child has them blocked initially. This
1531 * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
1532 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1534 assert_se(sigfillset(&ss
) >= 0);
1535 block_signals
= block_all
= true;
1537 } else if (flags
& FORK_WAIT
) {
1538 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1540 assert_se(sigemptyset(&ss
) >= 0);
1541 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1542 block_signals
= true;
1545 if (block_signals
) {
1546 if (sigprocmask(SIG_BLOCK
, &ss
, &saved_ss
) < 0)
1547 return log_full_errno(prio
, errno
, "Failed to block signal mask: %m");
1548 saved_ssp
= &saved_ss
;
1551 if (FLAGS_SET(flags
, FORK_DETACH
)) {
1552 /* Fork off intermediary child if needed */
1554 r
= is_reaper_process();
1556 return log_full_errno(prio
, r
, "Failed to determine if we are a reaper process: %m");
1559 /* Not a reaper process, hence do a double fork() so we are reparented to one */
1561 if (ret_pid
&& socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pidref_transport_fds
) < 0)
1562 return log_full_errno(prio
, errno
, "Failed to allocate pidref socket: %m");
1566 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1568 log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT
".", strna(name
), pid
);
1570 pidref_transport_fds
[1] = safe_close(pidref_transport_fds
[1]);
1572 if (pidref_transport_fds
[0] >= 0) {
1573 /* Wait for the intermediary child to exit so the caller can be certain the actual child
1574 * process has been reparented by the time this function returns. */
1575 r
= wait_for_terminate_and_check(name
, pid
, FLAGS_SET(flags
, FORK_LOG
) ? WAIT_LOG
: 0);
1577 return log_full_errno(prio
, r
, "Failed to wait for intermediary process: %m");
1578 if (r
!= EXIT_SUCCESS
) /* exit status > 0 should be treated as failure, too */
1582 ssize_t n
= receive_one_fd_iov(
1583 pidref_transport_fds
[0],
1584 &IOVEC_MAKE(&pid
, sizeof(pid
)),
1589 return log_full_errno(prio
, n
, "Failed to receive child pidref: %m");
1591 *ret_pid
= (PidRef
) { .pid
= pid
, .fd
= pidfd
};
1594 return 1; /* return in the parent */
1597 pidref_transport_fds
[0] = safe_close(pidref_transport_fds
[0]);
1598 intermediary
= true;
1602 if ((flags
& (FORK_NEW_MOUNTNS
|FORK_NEW_USERNS
|FORK_NEW_NETNS
|FORK_NEW_PIDNS
)) != 0)
1603 pid
= raw_clone(SIGCHLD
|
1604 (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
) ? CLONE_NEWNS
: 0) |
1605 (FLAGS_SET(flags
, FORK_NEW_USERNS
) ? CLONE_NEWUSER
: 0) |
1606 (FLAGS_SET(flags
, FORK_NEW_NETNS
) ? CLONE_NEWNET
: 0) |
1607 (FLAGS_SET(flags
, FORK_NEW_PIDNS
) ? CLONE_NEWPID
: 0));
1611 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1614 /* If we are in the intermediary process, exit now */
1616 if (pidref_transport_fds
[1] >= 0) {
1617 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
1619 r
= pidref_set_pid(&pidref
, pid
);
1621 log_full_errno(prio
, r
, "Failed to open reference to PID "PID_FMT
": %m", pid
);
1622 _exit(EXIT_FAILURE
);
1625 r
= send_one_fd_iov(
1626 pidref_transport_fds
[1],
1628 &IOVEC_MAKE(&pidref
.pid
, sizeof(pidref
.pid
)),
1632 log_full_errno(prio
, r
, "Failed to send child pidref: %m");
1633 _exit(EXIT_FAILURE
);
1637 _exit(EXIT_SUCCESS
);
1640 /* We are in the parent process */
1641 log_debug("Successfully forked off '%s' as PID " PID_FMT
".", strna(name
), pid
);
1643 if (flags
& FORK_WAIT
) {
1645 /* undo everything except SIGCHLD */
1647 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1648 (void) sigprocmask(SIG_SETMASK
, &ss
, NULL
);
1651 r
= wait_for_terminate_and_check(name
, pid
, (flags
& FORK_LOG
? WAIT_LOG
: 0));
1654 if (r
!= EXIT_SUCCESS
) /* exit status > 0 should be treated as failure, too */
1657 /* If we are in the parent and successfully waited, then the process doesn't exist anymore. */
1659 *ret_pid
= PIDREF_NULL
;
1665 if (FLAGS_SET(flags
, FORK_PID_ONLY
))
1666 *ret_pid
= PIDREF_MAKE_FROM_PID(pid
);
1668 r
= pidref_set_pid(ret_pid
, pid
);
1669 if (r
< 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1670 *ret_pid
= PIDREF_MAKE_FROM_PID(pid
);
1677 /* We are in the child process */
1679 pidref_transport_fds
[1] = safe_close(pidref_transport_fds
[1]);
1681 /* Restore signal mask manually */
1684 if (flags
& FORK_REOPEN_LOG
) {
1685 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1687 log_set_open_when_needed(true);
1688 log_settle_target();
1692 r
= rename_process(name
);
1694 log_full_errno(flags
& FORK_LOG
? LOG_WARNING
: LOG_DEBUG
,
1695 r
, "Failed to rename process, ignoring: %m");
1698 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGKILL
))
1699 if (prctl(PR_SET_PDEATHSIG
, fork_flags_to_signal(flags
)) < 0) {
1700 log_full_errno(prio
, errno
, "Failed to set death signal: %m");
1701 _exit(EXIT_FAILURE
);
1704 if (flags
& FORK_RESET_SIGNALS
) {
1705 r
= reset_all_signal_handlers();
1707 log_full_errno(prio
, r
, "Failed to reset signal handlers: %m");
1708 _exit(EXIT_FAILURE
);
1711 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1712 r
= reset_signal_mask();
1714 log_full_errno(prio
, r
, "Failed to reset signal mask: %m");
1715 _exit(EXIT_FAILURE
);
1717 } else if (block_signals
) { /* undo what we did above */
1718 if (sigprocmask(SIG_SETMASK
, &saved_ss
, NULL
) < 0) {
1719 log_full_errno(prio
, errno
, "Failed to restore signal mask: %m");
1720 _exit(EXIT_FAILURE
);
1724 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
|FORK_DEATHSIG_SIGINT
)) {
1726 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1727 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1731 /* Parent is in a different PID namespace. */;
1732 else if (ppid
!= original_pid
) {
1733 int sig
= fork_flags_to_signal(flags
);
1734 log_debug("Parent died early, raising %s.", signal_to_string(sig
));
1736 _exit(EXIT_FAILURE
);
1740 if (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
| FORK_MOUNTNS_SLAVE
)) {
1741 /* Optionally, make sure we never propagate mounts to the host. */
1742 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
1743 log_full_errno(prio
, errno
, "Failed to remount root directory as MS_SLAVE: %m");
1744 _exit(EXIT_FAILURE
);
1748 if (FLAGS_SET(flags
, FORK_PRIVATE_TMP
)) {
1749 assert(FLAGS_SET(flags
, FORK_NEW_MOUNTNS
));
1751 /* Optionally, overmount new tmpfs instance on /tmp/. */
1752 r
= mount_nofollow("tmpfs", "/tmp", "tmpfs",
1754 "mode=01777" TMPFS_LIMITS_RUN
);
1756 log_full_errno(prio
, r
, "Failed to overmount /tmp/: %m");
1757 _exit(EXIT_FAILURE
);
1761 if (flags
& FORK_REARRANGE_STDIO
) {
1763 r
= rearrange_stdio(stdio_fds
[0], stdio_fds
[1], stdio_fds
[2]);
1765 log_full_errno(prio
, r
, "Failed to rearrange stdio fds: %m");
1766 _exit(EXIT_FAILURE
);
1769 /* Turn off O_NONBLOCK on the fdio fds, in case it was left on */
1770 stdio_disable_nonblock();
1772 r
= make_null_stdio();
1774 log_full_errno(prio
, r
, "Failed to connect stdin/stdout to /dev/null: %m");
1775 _exit(EXIT_FAILURE
);
1778 } else if (flags
& FORK_STDOUT_TO_STDERR
) {
1779 if (dup2(STDERR_FILENO
, STDOUT_FILENO
) < 0) {
1780 log_full_errno(prio
, errno
, "Failed to connect stdout to stderr: %m");
1781 _exit(EXIT_FAILURE
);
1785 if (flags
& FORK_CLOSE_ALL_FDS
) {
1786 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1789 r
= close_all_fds(except_fds
, n_except_fds
);
1791 log_full_errno(prio
, r
, "Failed to close all file descriptors: %m");
1792 _exit(EXIT_FAILURE
);
1796 if (flags
& FORK_PACK_FDS
) {
1797 /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1798 * open, this is including the log. This is required by pack_fds, which will
1799 * get stuck in an infinite loop of any FDs other than except_fds are open. */
1800 assert(FLAGS_SET(flags
, FORK_CLOSE_ALL_FDS
));
1802 r
= pack_fds(except_fds
, n_except_fds
);
1804 log_full_errno(prio
, r
, "Failed to pack file descriptors: %m");
1805 _exit(EXIT_FAILURE
);
1809 if (flags
& FORK_CLOEXEC_OFF
) {
1810 r
= fd_cloexec_many(except_fds
, n_except_fds
, false);
1812 log_full_errno(prio
, r
, "Failed to turn off O_CLOEXEC on file descriptors: %m");
1813 _exit(EXIT_FAILURE
);
1817 /* When we were asked to reopen the logs, do so again now */
1818 if (flags
& FORK_REOPEN_LOG
) {
1820 log_set_open_when_needed(false);
1823 if (flags
& FORK_RLIMIT_NOFILE_SAFE
) {
1824 r
= rlimit_nofile_safe();
1826 log_full_errno(prio
, r
, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1827 _exit(EXIT_FAILURE
);
1831 if (!FLAGS_SET(flags
, FORK_KEEP_NOTIFY_SOCKET
)) {
1832 r
= RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
1834 log_full_errno(prio
, r
, "Failed to unset $NOTIFY_SOCKET: %m");
1835 _exit(EXIT_FAILURE
);
1839 if (FLAGS_SET(flags
, FORK_FREEZE
))
1843 if (FLAGS_SET(flags
, FORK_PID_ONLY
))
1844 *ret_pid
= PIDREF_MAKE_FROM_PID(getpid_cached());
1846 r
= pidref_set_self(ret_pid
);
1848 log_full_errno(prio
, r
, "Failed to acquire PID reference on ourselves: %m");
1849 _exit(EXIT_FAILURE
);
1859 const int stdio_fds
[3],
1861 size_t n_except_fds
,
1865 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
1868 /* Getting the detached child process pid without pidfd is racy, so don't allow it if not returning
1869 * a pidref to the caller. */
1870 assert(!FLAGS_SET(flags
, FORK_DETACH
) || !ret_pid
);
1872 r
= pidref_safe_fork_full(name
, stdio_fds
, except_fds
, n_except_fds
, flags
|FORK_PID_ONLY
, ret_pid
? &pidref
: NULL
);
1873 if (r
< 0 || !ret_pid
)
1876 *ret_pid
= pidref
.pid
;
1882 const char *outer_name
,
1883 const char *inner_name
,
1885 size_t n_except_fds
,
1896 /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1897 * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1898 * /proc/self/fd works correctly. */
1900 r
= safe_fork_full(outer_name
,
1902 except_fds
, n_except_fds
,
1903 (flags
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
) & ~(FORK_REOPEN_LOG
|FORK_NEW_MOUNTNS
|FORK_MOUNTNS_SLAVE
), ret_pid
);
1911 r
= namespace_enter(pidns_fd
, mntns_fd
, netns_fd
, userns_fd
, root_fd
);
1913 log_full_errno(FLAGS_SET(flags
, FORK_LOG
) ? LOG_ERR
: LOG_DEBUG
, r
, "Failed to join namespace: %m");
1914 _exit(EXIT_FAILURE
);
1917 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1918 r
= safe_fork_full(inner_name
,
1920 except_fds
, n_except_fds
,
1921 flags
& ~(FORK_WAIT
|FORK_RESET_SIGNALS
|FORK_CLOSE_ALL_FDS
|FORK_REARRANGE_STDIO
), &pid
);
1923 _exit(EXIT_FAILURE
);
1931 r
= wait_for_terminate_and_check(inner_name
, pid
, FLAGS_SET(flags
, FORK_LOG
) ? WAIT_LOG
: 0);
1933 _exit(EXIT_FAILURE
);
1941 int set_oom_score_adjust(int value
) {
1942 char t
[DECIMAL_STR_MAX(int)];
1944 if (!oom_score_adjust_is_valid(value
))
1947 xsprintf(t
, "%i", value
);
1949 return write_string_file("/proc/self/oom_score_adj", t
,
1950 WRITE_STRING_FILE_VERIFY_ON_FAILURE
|WRITE_STRING_FILE_DISABLE_BUFFER
);
1953 int get_oom_score_adjust(int *ret
) {
1954 _cleanup_free_
char *t
= NULL
;
1957 r
= read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX
, &t
, NULL
);
1961 delete_trailing_chars(t
, WHITESPACE
);
1963 r
= safe_atoi(t
, &a
);
1967 if (!oom_score_adjust_is_valid(a
))
/* Converts an RLIMIT_NICE resource limit value into the lowest nice level it permits, i.e.
 * PRIO_MAX - limit, clamped to the range PRIO_MIN…PRIO_MAX-1. */
static int rlimit_to_nice(rlim_t limit) {
        if (limit <= 1)
                return PRIO_MAX-1; /* i.e. 19 */

        if (limit >= -PRIO_MIN + PRIO_MAX)
                return PRIO_MIN; /* i.e. -20 */

        return PRIO_MAX - (int) limit;
}
/* Tries to set the nice level of the calling process to 'priority'. If that is refused for lack of
 * privileges, lowers the nice level as far as RLIMIT_NICE permits instead.
 *
 * Returns 1 if the requested nice level is in effect, 0 if only the closest permitted one is, and a
 * negative errno-style error otherwise. */
int setpriority_closest(int priority) {
        struct rlimit highest;
        int r, current, limit;

        /* Try to set requested nice level */
        r = RET_NERRNO(setpriority(PRIO_PROCESS, 0, priority));
        if (r >= 0)
                return 1;
        if (!ERRNO_IS_NEG_PRIVILEGE(r))
                return r;

        /* getpriority() may legitimately return -1, hence errno is the only way to detect failure */
        errno = 0;
        current = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        if (priority == current)
                return 1;

        /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it doesn't,
         * then the whole setpriority() system call is blocked to us, hence let's propagate the error
         * right away */
        if (priority > current)
                return r;

        if (getrlimit(RLIMIT_NICE, &highest) < 0)
                return -errno;

        limit = rlimit_to_nice(highest.rlim_cur);

        /* Push to the allowed limit if we're higher than that. Note that we could also be less nice than
         * limit allows us, but still higher than what's requested. In that case our current value is
         * the best choice. */
        if (current > limit)
                if (setpriority(PRIO_PROCESS, 0, limit) < 0)
                        return -errno;

        log_debug("Cannot set requested nice level (%i), using next best (%i).", priority, MIN(current, limit));
        return 0;
}
2027 _noreturn_
void freeze(void) {
2030 /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
2031 * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
2032 * to be compatible with being called from signal handlers. */
2033 (void) close_all_fds_without_malloc(NULL
, 0);
2035 /* Let's not freeze right away, but keep reaping zombies. */
2039 if (waitid(P_ALL
, 0, &si
, WEXITED
) < 0 && errno
!= EINTR
)
2043 /* waitid() failed with an ECHLD error (because there are no left-over child processes) or any other
2044 * (unexpected) error. Freeze for good now! */
2049 int get_process_threads(pid_t pid
) {
2050 _cleanup_free_
char *t
= NULL
;
2056 r
= procfs_file_get_field(pid
, "status", "Threads", &t
);
2062 r
= safe_atoi(t
, &n
);
/* Returns > 0 if the calling process is a reaper process, 0 if not, and a negative errno-style error
 * if the PR_GET_CHILD_SUBREAPER query fails. */
int is_reaper_process(void) {
        int b = 0;

        /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
         * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */

        if (getpid_cached() == 1)
                return true;

        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return b != 0;
}
/* Turns the child subreaper status of the calling process on or off, according to 'b'. For PID 1
 * no prctl() is issued: enabling is a no-op returning 0, while disabling is refused with -EINVAL.
 * Returns 0 on success, a negative errno-style error otherwise. */
int make_reaper_process(bool b) {

        if (getpid_cached() == 1) {

                if (!b)
                        return -EINVAL;

                return 0;
        }

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2104 DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t
*, posix_spawnattr_destroy
, NULL
);
2106 int posix_spawn_wrapper(
2111 PidRef
*ret_pidref
) {
2113 short flags
= POSIX_SPAWN_SETSIGMASK
;
2114 posix_spawnattr_t attr
;
2118 /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2119 * caller will be blocked until the child either exits or exec's. The memory of the child will be
2120 * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2123 * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2125 * returns 1: We're already in the right cgroup
2126 * 0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2127 * needs to call 'cg_attach' on their own */
2133 assert_se(sigfillset(&mask
) >= 0);
2135 r
= posix_spawnattr_init(&attr
);
2137 return -r
; /* These functions return a positive errno on failure */
2139 /* Initialization needs to succeed before we can set up a destructor. */
2140 _unused_
_cleanup_(posix_spawnattr_destroyp
) posix_spawnattr_t
*attr_destructor
= &attr
;
2142 #if HAVE_PIDFD_SPAWN
2143 static bool have_clone_into_cgroup
= true; /* kernel 5.7+ */
2144 _cleanup_close_
int cgroup_fd
= -EBADF
;
2146 if (cgroup
&& have_clone_into_cgroup
) {
2147 _cleanup_free_
char *resolved_cgroup
= NULL
;
2149 r
= cg_get_path_and_check(
2150 SYSTEMD_CGROUP_CONTROLLER
,
2157 cgroup_fd
= open(resolved_cgroup
, O_PATH
|O_DIRECTORY
|O_CLOEXEC
);
2161 r
= posix_spawnattr_setcgroup_np(&attr
, cgroup_fd
);
2165 flags
|= POSIX_SPAWN_SETCGROUP
;
2169 r
= posix_spawnattr_setflags(&attr
, flags
);
2172 r
= posix_spawnattr_setsigmask(&attr
, &mask
);
2176 #if HAVE_PIDFD_SPAWN
2177 _cleanup_close_
int pidfd
= -EBADF
;
2179 r
= pidfd_spawn(&pidfd
, path
, NULL
, &attr
, argv
, envp
);
2180 if (ERRNO_IS_NOT_SUPPORTED(r
) && FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
) && cg_is_threaded(cgroup
) > 0)
2181 return -EUCLEAN
; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
2182 turn that into something recognizable */
2183 if ((ERRNO_IS_NOT_SUPPORTED(r
) || ERRNO_IS_PRIVILEGE(r
) || r
== E2BIG
) &&
2184 FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
)) {
2185 /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
2186 * need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
2187 * Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
2188 * but not CLONE_INTO_CGROUP. */
2190 /* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
2191 * retry every time. */
2192 have_clone_into_cgroup
= false;
2194 flags
&= ~POSIX_SPAWN_SETCGROUP
;
2195 r
= posix_spawnattr_setflags(&attr
, flags
);
2199 r
= pidfd_spawn(&pidfd
, path
, NULL
, &attr
, argv
, envp
);
2204 r
= pidref_set_pidfd_consume(ret_pidref
, TAKE_FD(pidfd
));
2208 return FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
);
2212 r
= posix_spawn(&pid
, path
, NULL
, &attr
, argv
, envp
);
2216 r
= pidref_set_pid(ret_pidref
, pid
);
2220 return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
2224 int proc_dir_open(DIR **ret
) {
2229 d
= opendir("/proc");
2237 int proc_dir_read(DIR *d
, pid_t
*ret
) {
2244 de
= readdir_no_dot(d
);
2252 if (!IN_SET(de
->d_type
, DT_DIR
, DT_UNKNOWN
))
2255 if (parse_pid(de
->d_name
, ret
) >= 0)
2264 int proc_dir_read_pidref(DIR *d
, PidRef
*ret
) {
2272 r
= proc_dir_read(d
, &pid
);
2278 r
= pidref_set_pid(ret
, pid
);
2279 if (r
== -ESRCH
) /* gone by now? skip it */
2292 static const char *const sigchld_code_table
[] = {
2293 [CLD_EXITED
] = "exited",
2294 [CLD_KILLED
] = "killed",
2295 [CLD_DUMPED
] = "dumped",
2296 [CLD_TRAPPED
] = "trapped",
2297 [CLD_STOPPED
] = "stopped",
2298 [CLD_CONTINUED
] = "continued",
2301 DEFINE_STRING_TABLE_LOOKUP(sigchld_code
, int);
2303 static const char* const sched_policy_table
[] = {
2304 [SCHED_OTHER
] = "other",
2305 [SCHED_BATCH
] = "batch",
2306 [SCHED_IDLE
] = "idle",
2307 [SCHED_FIFO
] = "fifo",
2311 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy
, int, INT_MAX
);
2313 _noreturn_
void report_errno_and_exit(int errno_fd
, int error
) {
2317 _exit(EXIT_SUCCESS
);
2319 assert(errno_fd
>= 0);
2321 r
= loop_write(errno_fd
, &error
, sizeof(error
));
2323 log_debug_errno(r
, "Failed to write errno to errno_fd=%d: %m", errno_fd
);
2325 _exit(EXIT_FAILURE
);
2328 int read_errno(int errno_fd
) {
2331 assert(errno_fd
>= 0);
2333 /* The issue here is that it's impossible to distinguish between an error code returned by child and
2334 * IO error arose when reading it. So, the function logs errors and return EIO for the later case. */
2336 ssize_t n
= loop_read(errno_fd
, &r
, sizeof(r
), /* do_poll = */ false);
2338 log_debug_errno(n
, "Failed to read errno: %m");
2341 if (n
== sizeof(r
)) {
2344 if (r
< 0) /* child process reported an error, return it */
2345 return log_debug_errno(r
, "Child process failed with errno: %m");
2346 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Received an errno, but it's a positive value.");
2349 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Received unexpected amount of bytes while reading errno.");
2351 /* the process exited without reporting an error, assuming success */