1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
12 #include <sys/mount.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/types.h>
19 #if HAVE_VALGRIND_VALGRIND_H
20 #include <valgrind/valgrind.h>
23 #include "sd-messages.h"
25 #include "alloc-util.h"
26 #include "architecture.h"
27 #include "argv-util.h"
28 #include "cgroup-util.h"
29 #include "dirent-util.h"
32 #include "errno-util.h"
37 #include "hostname-util.h"
38 #include "locale-util.h"
41 #include "memory-util.h"
42 #include "missing_sched.h"
43 #include "missing_syscall.h"
44 #include "missing_threads.h"
45 #include "mountpoint-util.h"
46 #include "namespace-util.h"
47 #include "nulstr-util.h"
48 #include "parse-util.h"
49 #include "path-util.h"
50 #include "process-util.h"
51 #include "raw-clone.h"
52 #include "rlimit-util.h"
53 #include "signal-util.h"
54 #include "stat-util.h"
55 #include "stdio-util.h"
56 #include "string-table.h"
57 #include "string-util.h"
58 #include "terminal-util.h"
59 #include "user-util.h"
62 /* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
63 * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
65 #define COMM_MAX_LEN 128
67 static int get_process_state(pid_t pid
) {
68 _cleanup_free_
char *line
= NULL
;
75 /* Shortcut: if we are asked about our own state, we are obviously running */
76 if (pid
== 0 || pid
== getpid_cached())
77 return (unsigned char) 'R';
79 p
= procfs_file_alloca(pid
, "stat");
81 r
= read_one_line_file(p
, &line
);
87 p
= strrchr(line
, ')');
93 if (sscanf(p
, " %c", &state
) != 1)
96 return (unsigned char) state
;
99 int pid_get_comm(pid_t pid
, char **ret
) {
100 _cleanup_free_
char *escaped
= NULL
, *comm
= NULL
;
106 if (pid
== 0 || pid
== getpid_cached()) {
107 comm
= new0(char, TASK_COMM_LEN
+ 1); /* Must fit in 16 byte according to prctl(2) */
111 if (prctl(PR_GET_NAME
, comm
) < 0)
116 p
= procfs_file_alloca(pid
, "comm");
118 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
119 r
= read_one_line_file(p
, &comm
);
126 escaped
= new(char, COMM_MAX_LEN
);
130 /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
131 cellescape(escaped
, COMM_MAX_LEN
, comm
);
133 *ret
= TAKE_PTR(escaped
);
137 int pidref_get_comm(const PidRef
*pid
, char **ret
) {
138 _cleanup_free_
char *comm
= NULL
;
141 if (!pidref_is_set(pid
))
144 r
= pid_get_comm(pid
->pid
, &comm
);
148 r
= pidref_verify(pid
);
153 *ret
= TAKE_PTR(comm
);
157 static int pid_get_cmdline_nulstr(
160 ProcessCmdlineFlags flags
,
164 _cleanup_free_
char *t
= NULL
;
169 /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
170 * with a specified size.
172 * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
173 * (the case for kernel threads), or has a command line that resolves to the empty string, will
174 * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
177 * Returns an error, 0 if output was read but is truncated, 1 otherwise.
180 p
= procfs_file_alloca(pid
, "cmdline");
181 r
= read_virtual_file(p
, max_size
, &t
, &k
); /* Let's assume that each input byte results in >= 1
182 * columns of output. We ignore zero-width codepoints. */
189 if (!(flags
& PROCESS_CMDLINE_COMM_FALLBACK
))
192 /* Kernel threads have no argv[] */
193 _cleanup_free_
char *comm
= NULL
;
195 r
= pid_get_comm(pid
, &comm
);
200 t
= strjoin("[", comm
, "]");
206 if (r
== 0) /* truncation */
218 int pid_get_cmdline(pid_t pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
219 _cleanup_free_
char *t
= NULL
;
226 /* Retrieve and format a command line. See above for discussion of retrieval options.
228 * There are two main formatting modes:
230 * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
231 * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
234 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
235 * returned string is of the specified console width at most, abbreviated with an ellipsis.
237 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
238 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *ret otherwise. */
240 int full
= pid_get_cmdline_nulstr(pid
, max_columns
, flags
, &t
, &k
);
244 if (flags
& (PROCESS_CMDLINE_QUOTE
| PROCESS_CMDLINE_QUOTE_POSIX
)) {
245 ShellEscapeFlags shflags
= SHELL_ESCAPE_EMPTY
|
246 FLAGS_SET(flags
, PROCESS_CMDLINE_QUOTE_POSIX
) * SHELL_ESCAPE_POSIX
;
248 assert(!(flags
& PROCESS_CMDLINE_USE_LOCALE
));
250 _cleanup_strv_free_
char **args
= NULL
;
252 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
253 * See also issue #21186. */
254 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
258 ans
= quote_command_line(args
, shflags
);
262 /* Arguments are separated by NULs. Let's replace those with spaces. */
263 for (size_t i
= 0; i
< k
- 1; i
++)
267 delete_trailing_chars(t
, WHITESPACE
);
269 bool eight_bit
= (flags
& PROCESS_CMDLINE_USE_LOCALE
) && !is_locale_utf8();
271 ans
= escape_non_printable_full(t
, max_columns
,
272 eight_bit
* XESCAPE_8_BIT
| !full
* XESCAPE_FORCE_ELLIPSIS
);
276 ans
= str_realloc(ans
);
283 int pidref_get_cmdline(const PidRef
*pid
, size_t max_columns
, ProcessCmdlineFlags flags
, char **ret
) {
284 _cleanup_free_
char *s
= NULL
;
287 if (!pidref_is_set(pid
))
290 r
= pid_get_cmdline(pid
->pid
, max_columns
, flags
, &s
);
294 r
= pidref_verify(pid
);
303 int pid_get_cmdline_strv(pid_t pid
, ProcessCmdlineFlags flags
, char ***ret
) {
304 _cleanup_free_
char *t
= NULL
;
310 assert((flags
& ~PROCESS_CMDLINE_COMM_FALLBACK
) == 0);
313 r
= pid_get_cmdline_nulstr(pid
, SIZE_MAX
, flags
, &t
, &k
);
317 args
= strv_parse_nulstr_full(t
, k
, /* drop_trailing_nuls = */ true);
325 int pidref_get_cmdline_strv(const PidRef
*pid
, ProcessCmdlineFlags flags
, char ***ret
) {
326 _cleanup_strv_free_
char **args
= NULL
;
329 if (!pidref_is_set(pid
))
332 r
= pid_get_cmdline_strv(pid
->pid
, flags
, &args
);
336 r
= pidref_verify(pid
);
341 *ret
= TAKE_PTR(args
);
346 int container_get_leader(const char *machine
, pid_t
*pid
) {
347 _cleanup_free_
char *s
= NULL
, *class = NULL
;
355 if (streq(machine
, ".host")) {
360 if (!hostname_is_valid(machine
, 0))
363 p
= strjoina("/run/systemd/machines/", machine
);
364 r
= parse_env_file(NULL
, p
,
374 if (!streq_ptr(class, "container"))
377 r
= parse_pid(s
, &leader
);
387 int namespace_get_leader(pid_t pid
, NamespaceType type
, pid_t
*ret
) {
395 r
= get_process_ppid(pid
, &ppid
);
399 r
= in_same_namespace(pid
, ppid
, type
);
403 /* If the parent and the child are not in the same
404 * namespace, then the child is the leader we are
414 int pid_is_kernel_thread(pid_t pid
) {
415 _cleanup_free_
char *line
= NULL
;
416 unsigned long long flags
;
422 if (IN_SET(pid
, 0, 1) || pid
== getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
424 if (!pid_is_valid(pid
))
427 p
= procfs_file_alloca(pid
, "stat");
428 r
= read_one_line_file(p
, &line
);
434 /* Skip past the comm field */
435 q
= strrchr(line
, ')');
440 /* Skip 6 fields to reach the flags field */
441 for (i
= 0; i
< 6; i
++) {
442 l
= strspn(q
, WHITESPACE
);
447 l
= strcspn(q
, WHITESPACE
);
453 /* Skip preceding whitespace */
454 l
= strspn(q
, WHITESPACE
);
459 /* Truncate the rest */
460 l
= strcspn(q
, WHITESPACE
);
465 r
= safe_atollu(q
, &flags
);
469 return !!(flags
& PF_KTHREAD
);
472 int pidref_is_kernel_thread(const PidRef
*pid
) {
475 if (!pidref_is_set(pid
))
478 result
= pid_is_kernel_thread(pid
->pid
);
482 r
= pidref_verify(pid
); /* Verify that the PID wasn't reused since */
489 int get_process_capeff(pid_t pid
, char **ret
) {
496 p
= procfs_file_alloca(pid
, "status");
498 r
= get_proc_field(p
, "CapEff", WHITESPACE
, ret
);
/* Resolves a symlink below a process' procfs directory: builds the path for pid/proc_file with
 * procfs_file_alloca() and hands it to readlink_malloc() together with ret. The result of
 * readlink_malloc() is returned, except that -ENOENT is turned into -ESRCH when proc_mounted()
 * returns a positive value. */
505 static int get_process_link_contents(pid_t pid
, const char *proc_file
, char **ret
) {
511 p
= procfs_file_alloca(pid
, proc_file
);
513 r
= readlink_malloc(p
, ret
);
/* -ENOENT while proc_mounted() > 0 is reported as -ESRCH — presumably because a missing entry
 * on a mounted /proc means the process itself is gone. */
514 return (r
== -ENOENT
&& proc_mounted() > 0) ? -ESRCH
: r
;
517 int get_process_exe(pid_t pid
, char **ret
) {
523 r
= get_process_link_contents(pid
, "exe", ret
);
528 d
= endswith(*ret
, " (deleted)");
536 static int get_process_id(pid_t pid
, const char *field
, uid_t
*ret
) {
537 _cleanup_fclose_
FILE *f
= NULL
;
547 p
= procfs_file_alloca(pid
, "status");
548 r
= fopen_unlocked(p
, "re", &f
);
555 _cleanup_free_
char *line
= NULL
;
558 r
= read_stripped_line(f
, LONG_LINE_MAX
, &line
);
564 l
= startswith(line
, field
);
566 l
+= strspn(l
, WHITESPACE
);
568 l
[strcspn(l
, WHITESPACE
)] = 0;
570 return parse_uid(l
, ret
);
577 int pid_get_uid(pid_t pid
, uid_t
*ret
) {
580 if (pid
== 0 || pid
== getpid_cached()) {
585 return get_process_id(pid
, "Uid:", ret
);
588 int pidref_get_uid(const PidRef
*pid
, uid_t
*ret
) {
592 if (!pidref_is_set(pid
))
595 r
= pid_get_uid(pid
->pid
, &uid
);
599 r
= pidref_verify(pid
);
608 int get_process_gid(pid_t pid
, gid_t
*ret
) {
610 if (pid
== 0 || pid
== getpid_cached()) {
615 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
616 return get_process_id(pid
, "Gid:", ret
);
/* Retrieves the current working directory of a process into *ret. For pid 0 or our own PID the
 * answer comes straight from safe_getcwd(); for any other process the "cwd" link is resolved via
 * get_process_link_contents(). Returns whatever the chosen helper returns. */
619 int get_process_cwd(pid_t pid
, char **ret
) {
/* Asking about ourselves: no need to go through procfs */
622 if (pid
== 0 || pid
== getpid_cached())
623 return safe_getcwd(ret
);
625 return get_process_link_contents(pid
, "cwd", ret
);
/* Retrieves the root directory of a process into *ret by resolving its "root" link via
 * get_process_link_contents(), whose return value is passed through unchanged. */
628 int get_process_root(pid_t pid
, char **ret
) {
630 return get_process_link_contents(pid
, "root", ret
);
633 #define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
635 int get_process_environ(pid_t pid
, char **ret
) {
636 _cleanup_fclose_
FILE *f
= NULL
;
637 _cleanup_free_
char *outcome
= NULL
;
645 p
= procfs_file_alloca(pid
, "environ");
647 r
= fopen_unlocked(p
, "re", &f
);
656 if (sz
>= ENVIRONMENT_BLOCK_MAX
)
659 if (!GREEDY_REALLOC(outcome
, sz
+ 5))
662 r
= safe_fgetc(f
, &c
);
669 outcome
[sz
++] = '\n';
671 sz
+= cescape_char(c
, outcome
+ sz
);
675 *ret
= TAKE_PTR(outcome
);
680 int get_process_ppid(pid_t pid
, pid_t
*ret
) {
681 _cleanup_free_
char *line
= NULL
;
688 if (pid
== 0 || pid
== getpid_cached()) {
694 if (pid
== 1) /* PID 1 has no parent, shortcut this case */
695 return -EADDRNOTAVAIL
;
697 p
= procfs_file_alloca(pid
, "stat");
698 r
= read_one_line_file(p
, &line
);
704 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
705 * value, so let's skip over it manually */
707 p
= strrchr(line
, ')');
719 /* If ppid is zero the process has no parent. Which might be the case for PID 1 but also for
720 * processes originating in other namespaces that are inserted into a pidns. Return a recognizable
721 * error in this case. */
723 return -EADDRNOTAVAIL
;
725 if ((pid_t
) ppid
< 0 || (unsigned long) (pid_t
) ppid
!= ppid
)
734 int pid_get_start_time(pid_t pid
, uint64_t *ret
) {
735 _cleanup_free_
char *line
= NULL
;
741 p
= procfs_file_alloca(pid
, "stat");
742 r
= read_one_line_file(p
, &line
);
748 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
749 * value, so let's skip over it manually */
751 p
= strrchr(line
, ')');
775 "%*i " /* priority */
777 "%*u " /* num_threads */
778 "%*u " /* itrealvalue */
779 "%lu ", /* starttime */
789 int pidref_get_start_time(const PidRef
*pid
, uint64_t *ret
) {
793 if (!pidref_is_set(pid
))
796 r
= pid_get_start_time(pid
->pid
, ret
? &t
: NULL
);
800 r
= pidref_verify(pid
);
810 int get_process_umask(pid_t pid
, mode_t
*ret
) {
811 _cleanup_free_
char *m
= NULL
;
818 p
= procfs_file_alloca(pid
, "status");
820 r
= get_proc_field(p
, "Umask", WHITESPACE
, &m
);
826 return parse_mode(m
, ret
);
829 int wait_for_terminate(pid_t pid
, siginfo_t
*status
) {
840 if (waitid(P_PID
, pid
, status
, WEXITED
) < 0) {
845 return negative_errno();
854 * < 0 : wait_for_terminate() failed to get the state of the
855 * process, the process was terminated by a signal, or
856 * failed for an unknown reason.
857 * >=0 : The process terminated normally, and its exit code is
860 * That is, success is indicated by a return value of zero, and an
861 * error is indicated by a non-zero value.
863 * Abnormal termination is logged at LOG_ERR if WAIT_LOG_ABNORMAL is set in flags, and a non-zero exit
864 * status is logged at LOG_ERR if WAIT_LOG_NON_ZERO_EXIT_STATUS is set; otherwise LOG_DEBUG is used.
866 int wait_for_terminate_and_check(const char *name
, pid_t pid
, WaitFlags flags
) {
867 _cleanup_free_
char *buffer
= NULL
;
874 r
= pid_get_comm(pid
, &buffer
);
876 log_debug_errno(r
, "Failed to acquire process name of " PID_FMT
", ignoring: %m", pid
);
881 prio
= flags
& WAIT_LOG_ABNORMAL
? LOG_ERR
: LOG_DEBUG
;
883 r
= wait_for_terminate(pid
, &status
);
885 return log_full_errno(prio
, r
, "Failed to wait for %s: %m", strna(name
));
887 if (status
.si_code
== CLD_EXITED
) {
888 if (status
.si_status
!= EXIT_SUCCESS
)
889 log_full(flags
& WAIT_LOG_NON_ZERO_EXIT_STATUS
? LOG_ERR
: LOG_DEBUG
,
890 "%s failed with exit status %i.", strna(name
), status
.si_status
);
892 log_debug("%s succeeded.", name
);
894 return status
.si_status
;
896 } else if (IN_SET(status
.si_code
, CLD_KILLED
, CLD_DUMPED
)) {
898 log_full(prio
, "%s terminated by signal %s.", strna(name
), signal_to_string(status
.si_status
));
902 log_full(prio
, "%s failed due to unknown reason.", strna(name
));
909 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
910 * was terminated by a signal, or failed for an unknown reason.
912 * >=0 : The process terminated normally with no failures.
914 * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
915 * states are indicated by a non-zero value.
917 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
918 * to remain entirely race-free.
920 int wait_for_terminate_with_timeout(pid_t pid
, usec_t timeout
) {
925 assert_se(sigemptyset(&mask
) == 0);
926 assert_se(sigaddset(&mask
, SIGCHLD
) == 0);
928 /* Drop into a sigtimedwait-based timeout. Waiting for the
930 until
= usec_add(now(CLOCK_MONOTONIC
), timeout
);
933 siginfo_t status
= {};
935 n
= now(CLOCK_MONOTONIC
);
939 r
= RET_NERRNO(sigtimedwait(&mask
, NULL
, TIMESPEC_STORE(until
- n
)));
940 /* Assuming we woke due to the child exiting. */
941 if (waitid(P_PID
, pid
, &status
, WEXITED
|WNOHANG
) == 0) {
942 if (status
.si_pid
== pid
) {
943 /* This is the correct child. */
944 if (status
.si_code
== CLD_EXITED
)
945 return status
.si_status
== 0 ? 0 : -EPROTO
;
950 /* Not the child, check for errors and proceed appropriately */
954 /* Timed out, child is likely hung. */
957 /* Received a different signal and should retry */
960 /* Return any unexpected errors */
/* Sends SIGKILL to pid and then waits for it with wait_for_terminate(). This is best-effort: the
 * results of both calls are explicitly discarded and no exit status is collected (NULL is passed). */
969 void sigkill_wait(pid_t pid
) {
972 (void) kill(pid
, SIGKILL
);
973 (void) wait_for_terminate(pid
, NULL
);
976 void sigkill_waitp(pid_t
*pid
) {
/* Sends SIGTERM to pid through kill_and_sigcont() and then waits for it with wait_for_terminate().
 * This is best-effort: the results of both calls are explicitly discarded and no exit status is
 * collected (NULL is passed). */
987 void sigterm_wait(pid_t pid
) {
990 (void) kill_and_sigcont(pid
, SIGTERM
);
991 (void) wait_for_terminate(pid
, NULL
);
/* Sends SIGKILL to pid without waiting for the process afterwards. The result of kill() is
 * explicitly discarded. */
994 void sigkill_nowait(pid_t pid
) {
997 (void) kill(pid
, SIGKILL
);
1000 void sigkill_nowaitp(pid_t
*pid
) {
1008 sigkill_nowait(*pid
);
/* Sends signal sig to pid and, if that succeeded, follows up with SIGCONT. The follow-up is
 * skipped when sig is SIGCONT or SIGKILL, and its own result is ignored. */
1011 int kill_and_sigcont(pid_t pid
, int sig
) {
/* NOTE(review): RET_NERRNO() presumably maps kill()'s failure to a negative errno value — confirm
 * against its definition */
1014 r
= RET_NERRNO(kill(pid
, sig
));
1016 /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
1017 * affected by a process being suspended anyway. */
1018 if (r
>= 0 && !IN_SET(sig
, SIGCONT
, SIGKILL
))
1019 (void) kill(pid
, SIGCONT
);
1024 int getenv_for_pid(pid_t pid
, const char *field
, char **ret
) {
1025 _cleanup_fclose_
FILE *f
= NULL
;
1034 if (pid
== 0 || pid
== getpid_cached())
1035 return strdup_to_full(ret
, getenv(field
));
1037 if (!pid_is_valid(pid
))
1040 path
= procfs_file_alloca(pid
, "environ");
1042 r
= fopen_unlocked(path
, "re", &f
);
1049 _cleanup_free_
char *line
= NULL
;
1052 if (sum
> ENVIRONMENT_BLOCK_MAX
) /* Give up searching eventually */
1055 r
= read_nul_string(f
, LONG_LINE_MAX
, &line
);
1058 if (r
== 0) /* EOF */
1063 match
= startswith(line
, field
);
1064 if (match
&& *match
== '=')
1065 return strdup_to_full(ret
, match
+ 1);
1072 int pid_is_my_child(pid_t pid
) {
1082 r
= get_process_ppid(pid
, &ppid
);
1086 return ppid
== getpid_cached();
1089 int pidref_is_my_child(const PidRef
*pid
) {
1092 if (!pidref_is_set(pid
))
1095 result
= pid_is_my_child(pid
->pid
);
1099 r
= pidref_verify(pid
);
1106 int pid_is_unwaited(pid_t pid
) {
1107 /* Checks whether a PID is still valid at all, including a zombie */
1112 if (pid
<= 1) /* If we or PID 1 would be dead and have been waited for, this code would not be running */
1115 if (pid
== getpid_cached())
1118 if (kill(pid
, 0) >= 0)
1121 return errno
!= ESRCH
;
1124 int pidref_is_unwaited(const PidRef
*pid
) {
1127 if (!pidref_is_set(pid
))
1130 if (pid
->pid
== 1 || pidref_is_self(pid
))
1133 r
= pidref_kill(pid
, 0);
1142 int pid_is_alive(pid_t pid
) {
1145 /* Checks whether a PID is still valid and not a zombie */
1150 if (pid
<= 1) /* If we or PID 1 would be a zombie, this code would not be running */
1153 if (pid
== getpid_cached())
1156 r
= get_process_state(pid
);
1165 int pidref_is_alive(const PidRef
*pidref
) {
1168 if (!pidref_is_set(pidref
))
1171 result
= pid_is_alive(pidref
->pid
);
1173 assert(result
!= -ESRCH
);
1177 r
= pidref_verify(pidref
);
1186 int pid_from_same_root_fs(pid_t pid
) {
1192 if (pid
== 0 || pid
== getpid_cached())
1195 root
= procfs_file_alloca(pid
, "root");
1197 return inode_same(root
, "/proc/1/root", 0);
/* Determines whether the calling thread is the main thread by comparing getpid_cached() with
 * gettid(). The comparison is done once per thread and remembered in a thread-local variable:
 * 0 means "not determined yet", 1 means the IDs matched, -1 means they did not.
 * NOTE(review): the return statement is not visible here — presumably it returns cached > 0. */
1200 bool is_main_thread(void) {
1201 static thread_local
int cached
= 0;
/* First call in this thread: do the comparison and store the verdict */
1203 if (_unlikely_(cached
== 0))
1204 cached
= getpid_cached() == gettid() ? 1 : -1;
/* Returns true if oa lies within the inclusive range OOM_SCORE_ADJ_MIN…OOM_SCORE_ADJ_MAX. */
1209 bool oom_score_adjust_is_valid(int oa
) {
1210 return oa
>= OOM_SCORE_ADJ_MIN
&& oa
<= OOM_SCORE_ADJ_MAX
;
1213 unsigned long personality_from_string(const char *p
) {
1214 Architecture architecture
;
1217 return PERSONALITY_INVALID
;
1219 /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1220 * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1221 * the same register size. */
1223 architecture
= architecture_from_string(p
);
1224 if (architecture
< 0)
1225 return PERSONALITY_INVALID
;
1227 if (architecture
== native_architecture())
1229 #ifdef ARCHITECTURE_SECONDARY
1230 if (architecture
== ARCHITECTURE_SECONDARY
)
1234 return PERSONALITY_INVALID
;
1237 const char* personality_to_string(unsigned long p
) {
1238 Architecture architecture
= _ARCHITECTURE_INVALID
;
1241 architecture
= native_architecture();
1242 #ifdef ARCHITECTURE_SECONDARY
1243 else if (p
== PER_LINUX32
)
1244 architecture
= ARCHITECTURE_SECONDARY
;
1247 if (architecture
< 0)
1250 return architecture_to_string(architecture
);
1253 int safe_personality(unsigned long p
) {
1256 /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno,
1257 * and in others as negative return value containing an errno-like value. Let's work around this: this is a
1258 * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and
1259 * the return value indicating the same issue, so that we are definitely on the safe side.
1261 * See https://github.com/systemd/systemd/issues/6737 */
1264 ret
= personality(p
);
1275 int opinionated_personality(unsigned long *ret
) {
1278 /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1279 * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1280 * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1282 current
= safe_personality(PERSONALITY_INVALID
);
1286 if (((unsigned long) current
& OPINIONATED_PERSONALITY_MASK
) == PER_LINUX32
)
1294 void valgrind_summary_hack(void) {
1295 #if HAVE_VALGRIND_VALGRIND_H
1296 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND
) {
1298 pid
= raw_clone(SIGCHLD
);
1302 "MESSAGE_ID=" SD_MESSAGE_VALGRIND_HELPER_FORK_STR
,
1303 LOG_MESSAGE( "Failed to fork off valgrind helper: %m"));
1307 log_info("Spawned valgrind helper as PID "PID_FMT
".", pid
);
1308 (void) wait_for_terminate(pid
, NULL
);
1314 int pid_compare_func(const pid_t
*a
, const pid_t
*b
) {
1315 /* Suitable for usage in qsort() */
1319 /* The cached PID, possible values:
1321 * == UNSET [0] → cache not initialized yet
1322 * == BUSY [-1] → some thread is initializing it at the moment
1323 * any other → the cached PID
1326 #define CACHED_PID_UNSET ((pid_t) 0)
1327 #define CACHED_PID_BUSY ((pid_t) -1)
1329 static pid_t cached_pid
= CACHED_PID_UNSET
;
1331 void reset_cached_pid(void) {
1332 /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
1333 cached_pid
= CACHED_PID_UNSET
;
1336 pid_t
getpid_cached(void) {
1337 static bool installed
= false;
1338 pid_t current_value
= CACHED_PID_UNSET
;
1340 /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1341 * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1342 * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1343 * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1345 * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1346 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1349 (void) __atomic_compare_exchange_n(
1357 switch (current_value
) {
1359 case CACHED_PID_UNSET
: { /* Not initialized yet, then do so now */
1362 new_pid
= raw_getpid();
1365 /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1366 * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1367 * we'll check for errors only in the most generic fashion possible. */
1369 if (pthread_atfork(NULL
, NULL
, reset_cached_pid
) != 0) {
1370 /* OOM? Let's try again later */
1371 cached_pid
= CACHED_PID_UNSET
;
1378 cached_pid
= new_pid
;
1382 case CACHED_PID_BUSY
: /* Somebody else is currently initializing */
1383 return raw_getpid();
1385 default: /* Properly initialized */
1386 return current_value
;
1390 int must_be_root(void) {
1395 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Need to be root.");
1398 static void restore_sigsetp(sigset_t
**ssp
) {
1400 (void) sigprocmask(SIG_SETMASK
, *ssp
, NULL
);
1403 pid_t
clone_with_nested_stack(int (*fn
)(void *), int flags
, void *userdata
) {
1408 /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1409 * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1411 * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1412 * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1413 * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1414 * strictly single-threaded or your child never calls malloc(). */
1416 assert((flags
& (CLONE_VM
|CLONE_PARENT_SETTID
|CLONE_CHILD_SETTID
|
1417 CLONE_CHILD_CLEARTID
|CLONE_SETTLS
)) == 0);
1419 /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1420 * the net effect is that the child will have the start of its stack inside the stack of the parent,
1421 * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1422 * since we don't want to deal with differences between systems where the stack grows backwards or
1423 * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1424 * want it page aligned, hence we'll allocate one page more. Makes 3. */
1427 mystack
= alloca(ps
*3);
1428 mystack
= (uint8_t*) mystack
+ ps
; /* move pointer one page ahead since stacks usually grow backwards */
1429 mystack
= (void*) ALIGN_TO((uintptr_t) mystack
, ps
); /* align to page size (moving things further ahead) */
1432 pid
= clone(fn
, mystack
, flags
, userdata
);
1434 pid
= __clone2(fn
, mystack
, ps
, flags
, userdata
);
1442 static int fork_flags_to_signal(ForkFlags flags
) {
1443 return (flags
& FORK_DEATHSIG_SIGTERM
) ? SIGTERM
:
1444 (flags
& FORK_DEATHSIG_SIGINT
) ? SIGINT
:
1450 const int stdio_fds
[3],
1452 size_t n_except_fds
,
1456 pid_t original_pid
, pid
;
1457 sigset_t saved_ss
, ss
;
1458 _unused_
_cleanup_(restore_sigsetp
) sigset_t
*saved_ssp
= NULL
;
1459 bool block_signals
= false, block_all
= false, intermediary
= false;
1462 assert(!FLAGS_SET(flags
, FORK_DETACH
) || !ret_pid
);
1463 assert(!FLAGS_SET(flags
, FORK_DETACH
|FORK_WAIT
));
1465 /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
1466 * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
1468 prio
= flags
& FORK_LOG
? LOG_ERR
: LOG_DEBUG
;
1470 original_pid
= getpid_cached();
1472 if (flags
& FORK_FLUSH_STDIO
) {
1474 fflush(stderr
); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
1477 if (flags
& (FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
)) {
1478 /* We temporarily block all signals, so that the new child has them blocked initially. This
1479 * way, we can be sure that SIGTERMs we might send to the child are not lost. (Note that for
1480 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1482 assert_se(sigfillset(&ss
) >= 0);
1483 block_signals
= block_all
= true;
1485 } else if (flags
& FORK_WAIT
) {
1486 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1488 assert_se(sigemptyset(&ss
) >= 0);
1489 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1490 block_signals
= true;
1493 if (block_signals
) {
1494 if (sigprocmask(SIG_SETMASK
, &ss
, &saved_ss
) < 0)
1495 return log_full_errno(prio
, errno
, "Failed to set signal mask: %m");
1496 saved_ssp
= &saved_ss
;
1499 if (FLAGS_SET(flags
, FORK_DETACH
)) {
1500 assert(!FLAGS_SET(flags
, FORK_WAIT
));
1503 /* Fork off intermediary child if needed */
1505 r
= is_reaper_process();
1507 return log_full_errno(prio
, r
, "Failed to determine if we are a reaper process: %m");
1510 /* Not a reaper process, hence do a double fork() so we are reparented to one */
1514 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1516 log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT
".", strna(name
), pid
);
1517 return 1; /* return in the parent */
1520 intermediary
= true;
1524 if ((flags
& (FORK_NEW_MOUNTNS
|FORK_NEW_USERNS
|FORK_NEW_NETNS
)) != 0)
1525 pid
= raw_clone(SIGCHLD
|
1526 (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
) ? CLONE_NEWNS
: 0) |
1527 (FLAGS_SET(flags
, FORK_NEW_USERNS
) ? CLONE_NEWUSER
: 0) |
1528 (FLAGS_SET(flags
, FORK_NEW_NETNS
) ? CLONE_NEWNET
: 0));
1532 return log_full_errno(prio
, errno
, "Failed to fork off '%s': %m", strna(name
));
1535 /* If we are in the intermediary process, exit now */
1537 _exit(EXIT_SUCCESS
);
1539 /* We are in the parent process */
1540 log_debug("Successfully forked off '%s' as PID " PID_FMT
".", strna(name
), pid
);
1542 if (flags
& FORK_WAIT
) {
1544 /* undo everything except SIGCHLD */
1546 assert_se(sigaddset(&ss
, SIGCHLD
) >= 0);
1547 (void) sigprocmask(SIG_SETMASK
, &ss
, NULL
);
1550 r
= wait_for_terminate_and_check(name
, pid
, (flags
& FORK_LOG
? WAIT_LOG
: 0));
1553 if (r
!= EXIT_SUCCESS
) /* exit status > 0 should be treated as failure, too */
1563 /* We are in the child process */
1565 /* Restore signal mask manually */
1568 if (flags
& FORK_REOPEN_LOG
) {
1569 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1571 log_set_open_when_needed(true);
1572 log_settle_target();
1576 r
= rename_process(name
);
1578 log_full_errno(flags
& FORK_LOG
? LOG_WARNING
: LOG_DEBUG
,
1579 r
, "Failed to rename process, ignoring: %m");
1582 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGKILL
))
1583 if (prctl(PR_SET_PDEATHSIG
, fork_flags_to_signal(flags
)) < 0) {
1584 log_full_errno(prio
, errno
, "Failed to set death signal: %m");
1585 _exit(EXIT_FAILURE
);
1588 if (flags
& FORK_RESET_SIGNALS
) {
1589 r
= reset_all_signal_handlers();
1591 log_full_errno(prio
, r
, "Failed to reset signal handlers: %m");
1592 _exit(EXIT_FAILURE
);
1595 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1596 r
= reset_signal_mask();
1598 log_full_errno(prio
, r
, "Failed to reset signal mask: %m");
1599 _exit(EXIT_FAILURE
);
1601 } else if (block_signals
) { /* undo what we did above */
1602 if (sigprocmask(SIG_SETMASK
, &saved_ss
, NULL
) < 0) {
1603 log_full_errno(prio
, errno
, "Failed to restore signal mask: %m");
1604 _exit(EXIT_FAILURE
);
1608 if (flags
& (FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
|FORK_DEATHSIG_SIGINT
)) {
1610 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1611 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1615 /* Parent is in a different PID namespace. */;
1616 else if (ppid
!= original_pid
) {
1617 int sig
= fork_flags_to_signal(flags
);
1618 log_debug("Parent died early, raising %s.", signal_to_string(sig
));
1620 _exit(EXIT_FAILURE
);
1624 if (FLAGS_SET(flags
, FORK_NEW_MOUNTNS
| FORK_MOUNTNS_SLAVE
)) {
1625 /* Optionally, make sure we never propagate mounts to the host. */
1626 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
1627 log_full_errno(prio
, errno
, "Failed to remount root directory as MS_SLAVE: %m");
1628 _exit(EXIT_FAILURE
);
1632 if (FLAGS_SET(flags
, FORK_PRIVATE_TMP
)) {
1633 assert(FLAGS_SET(flags
, FORK_NEW_MOUNTNS
));
1635 /* Optionally, overmount new tmpfs instance on /tmp/. */
1636 r
= mount_nofollow("tmpfs", "/tmp", "tmpfs",
1638 "mode=01777" TMPFS_LIMITS_RUN
);
1640 log_full_errno(prio
, r
, "Failed to overmount /tmp/: %m");
1641 _exit(EXIT_FAILURE
);
1645 if (flags
& FORK_REARRANGE_STDIO
) {
1647 r
= rearrange_stdio(stdio_fds
[0], stdio_fds
[1], stdio_fds
[2]);
1649 log_full_errno(prio
, r
, "Failed to rearrange stdio fds: %m");
1650 _exit(EXIT_FAILURE
);
1653 /* Turn off O_NONBLOCK on the stdio fds, in case it was left on */
1654 stdio_disable_nonblock();
1656 r
= make_null_stdio();
1658 log_full_errno(prio
, r
, "Failed to connect stdin/stdout to /dev/null: %m");
1659 _exit(EXIT_FAILURE
);
1662 } else if (flags
& FORK_STDOUT_TO_STDERR
) {
1663 if (dup2(STDERR_FILENO
, STDOUT_FILENO
) < 0) {
1664 log_full_errno(prio
, errno
, "Failed to connect stdout to stderr: %m");
1665 _exit(EXIT_FAILURE
);
1669 if (flags
& FORK_CLOSE_ALL_FDS
) {
1670 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1673 r
= close_all_fds(except_fds
, n_except_fds
);
1675 log_full_errno(prio
, r
, "Failed to close all file descriptors: %m");
1676 _exit(EXIT_FAILURE
);
1680 if (flags
& FORK_PACK_FDS
) {
1681 /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1682 * open, this is including the log. This is required by pack_fds, which will
1683 * get stuck in an infinite loop if any FDs other than except_fds are open. */
1684 assert(FLAGS_SET(flags
, FORK_CLOSE_ALL_FDS
));
1686 r
= pack_fds(except_fds
, n_except_fds
);
1688 log_full_errno(prio
, r
, "Failed to pack file descriptors: %m");
1689 _exit(EXIT_FAILURE
);
1693 if (flags
& FORK_CLOEXEC_OFF
) {
1694 r
= fd_cloexec_many(except_fds
, n_except_fds
, false);
1696 log_full_errno(prio
, r
, "Failed to turn off O_CLOEXEC on file descriptors: %m");
1697 _exit(EXIT_FAILURE
);
1701 /* When we were asked to reopen the logs, do so again now */
1702 if (flags
& FORK_REOPEN_LOG
) {
1704 log_set_open_when_needed(false);
1707 if (flags
& FORK_RLIMIT_NOFILE_SAFE
) {
1708 r
= rlimit_nofile_safe();
1710 log_full_errno(prio
, r
, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1711 _exit(EXIT_FAILURE
);
1715 if (!FLAGS_SET(flags
, FORK_KEEP_NOTIFY_SOCKET
)) {
1716 r
= RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
1718 log_full_errno(prio
, r
, "Failed to unset $NOTIFY_SOCKET: %m");
1719 _exit(EXIT_FAILURE
);
1724 *ret_pid
= getpid_cached();
1729 int pidref_safe_fork_full(
1731 const int stdio_fds
[3],
1733 size_t n_except_fds
,
1740 assert(!FLAGS_SET(flags
, FORK_WAIT
));
1742 r
= safe_fork_full(name
, stdio_fds
, except_fds
, n_except_fds
, flags
, &pid
);
1746 q
= pidref_set_pid(ret_pid
, pid
);
1747 if (q
< 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1748 *ret_pid
= PIDREF_MAKE_FROM_PID(pid
);
1754 const char *outer_name
,
1755 const char *inner_name
,
1757 size_t n_except_fds
,
1768 /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1769 * process. This ensures that we are fully a member of the destination namespace, with pidns and all, so that
1770 * /proc/self/fd works correctly. */
1772 r
= safe_fork_full(outer_name
,
1774 except_fds
, n_except_fds
,
1775 (flags
|FORK_DEATHSIG_SIGINT
|FORK_DEATHSIG_SIGTERM
|FORK_DEATHSIG_SIGKILL
) & ~(FORK_REOPEN_LOG
|FORK_NEW_MOUNTNS
|FORK_MOUNTNS_SLAVE
), ret_pid
);
1783 r
= namespace_enter(pidns_fd
, mntns_fd
, netns_fd
, userns_fd
, root_fd
);
1785 log_full_errno(FLAGS_SET(flags
, FORK_LOG
) ? LOG_ERR
: LOG_DEBUG
, r
, "Failed to join namespace: %m");
1786 _exit(EXIT_FAILURE
);
1789 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1790 r
= safe_fork_full(inner_name
,
1792 except_fds
, n_except_fds
,
1793 flags
& ~(FORK_WAIT
|FORK_RESET_SIGNALS
|FORK_CLOSE_ALL_FDS
|FORK_REARRANGE_STDIO
), &pid
);
1795 _exit(EXIT_FAILURE
);
1803 r
= wait_for_terminate_and_check(inner_name
, pid
, FLAGS_SET(flags
, FORK_LOG
) ? WAIT_LOG
: 0);
1805 _exit(EXIT_FAILURE
);
/* Sets the OOM score adjustment of the calling process: formats value as a decimal string and
 * writes it to /proc/self/oom_score_adj. Returns the result of write_string_file(). */
1813 int set_oom_score_adjust(int value
) {
/* Buffer sized via DECIMAL_STR_MAX(int) to hold the "%i" rendering of value */
1814 char t
[DECIMAL_STR_MAX(int)];
1816 xsprintf(t
, "%i", value
);
1818 return write_string_file("/proc/self/oom_score_adj", t
,
1819 WRITE_STRING_FILE_VERIFY_ON_FAILURE
|WRITE_STRING_FILE_DISABLE_BUFFER
);
/* Reads the OOM score adjustment of the calling process from /proc/self/oom_score_adj.
 *
 * The file is read with read_virtual_file(), trailing whitespace is stripped, and the remainder must parse
 * with safe_atoi() into a value accepted by oom_score_adjust_is_valid() — both are enforced with
 * assert_se(), i.e. a malformed value is treated as a programming error rather than reported to the caller.
 *
 * NOTE(review): the error check after the read, and how the parsed value is stored through 'ret' and
 * returned, are not visible in this view. */
1822 int get_oom_score_adjust(int *ret
) {
1823 _cleanup_free_
char *t
= NULL
;
1826 r
= read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX
, &t
, NULL
);
1830 delete_trailing_chars(t
, WHITESPACE
);
1832 assert_se(safe_atoi(t
, &a
) >= 0);
1833 assert_se(oom_score_adjust_is_valid(a
));
1840 int pidfd_get_pid(int fd
, pid_t
*ret
) {
1841 char path
[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
1842 _cleanup_free_
char *fdinfo
= NULL
;
1846 /* Converts a pidfd into a pid. Well known errors:
1848 * -EBADF → fd invalid
1849 * -ENOSYS → /proc/ not mounted
1850 * -ENOTTY → fd valid, but not a pidfd
1851 * -EREMOTE → fd valid, but pid is in another namespace we cannot translate to the local one
1852 * -ESRCH → fd valid, but process is already reaped
1858 xsprintf(path
, "/proc/self/fdinfo/%i", fd
);
1860 r
= read_full_virtual_file(path
, &fdinfo
, NULL
);
1861 if (r
== -ENOENT
) /* if fdinfo doesn't exist we assume the process does not exist */
1862 return proc_mounted() > 0 ? -EBADF
: -ENOSYS
;
1866 p
= find_line_startswith(fdinfo
, "Pid:");
1868 return -ENOTTY
; /* not a pidfd? */
1870 p
+= strspn(p
, WHITESPACE
);
1871 p
[strcspn(p
, WHITESPACE
)] = 0;
1874 return -EREMOTE
; /* PID is in foreign PID namespace? */
1876 return -ESRCH
; /* refers to reaped process? */
1878 return parse_pid(p
, ret
);
/* Checks whether the process a pidfd refers to is the one identified by 'pid'.
 *
 * Returns 0 if the PID resolved from 'pidfd' equals 'pid', -ESRCH if it differs, and otherwise the
 * negative errno-style error pidfd_get_pid() failed with. */
int pidfd_verify_pid(int pidfd, pid_t pid) {
        pid_t current_pid;
        int r;

        r = pidfd_get_pid(pidfd, &current_pid);
        if (r < 0)
                return r; /* current_pid may be unset on failure, so it must not be compared */

        return current_pid != pid ? -ESRCH : 0;
}
/* Translates an RLIMIT_NICE resource limit value into the lowest nice level it permits.
 *
 * The result is PRIO_MAX - limit, clamped to the valid nice range: limits of 0 or 1 map to
 * PRIO_MAX-1 (the nicest level), and limits of PRIO_MAX - PRIO_MIN or more map to PRIO_MIN. */
static int rlimit_to_nice(rlim_t limit) {
        /* PRIO_MAX - 0 would lie outside the valid nice range, hence clamp at the upper end */
        if (limit <= 1)
                return PRIO_MAX-1; /* i.e. 19 */

        /* Clamp at the lower end too; this also keeps the cast below from overflowing for huge limits */
        if (limit >= -PRIO_MIN + PRIO_MAX)
                return PRIO_MIN; /* i.e. -20 */

        return PRIO_MAX - (int) limit;
}
/* Sets the nice level of the calling process to 'priority', or to the closest level it is allowed to use.
 *
 * First tries setpriority(PRIO_PROCESS, 0, priority) directly. If that fails with a privilege error
 * (ERRNO_IS_PRIVILEGE), the current priority and the RLIMIT_NICE soft limit are consulted: the limit is
 * translated with rlimit_to_nice() and, unless the process is already less nice than the limit allows,
 * the nice level is pushed to that limit and a debug message records the substitution.
 *
 * NOTE(review): the return statements of every branch, the error check around getpriority(), and the end
 * of the comment beginning "Hmm, we'd expect" are not visible in this view — confirm the return
 * convention against the full definition. */
1905 int setpriority_closest(int priority
) {
1906 int current
, limit
, saved_errno
;
1907 struct rlimit highest
;
1909 /* Try to set requested nice level */
1910 if (setpriority(PRIO_PROCESS
, 0, priority
) >= 0)
1913 /* Permission failed */
1914 saved_errno
= -errno
;
1915 if (!ERRNO_IS_PRIVILEGE(saved_errno
))
1919 current
= getpriority(PRIO_PROCESS
, 0);
1923 if (priority
== current
)
1926 /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it doesn't,
1927 * then the whole setpriority() system call is blocked to us, hence let's propagate the error
1929 if (priority
> current
)
1932 if (getrlimit(RLIMIT_NICE
, &highest
) < 0)
1935 limit
= rlimit_to_nice(highest
.rlim_cur
);
1937 /* We are already less nice than limit allows us */
1938 if (current
< limit
) {
1939 log_debug("Cannot raise nice level, permissions and the resource limit do not allow it.");
1943 /* Push to the allowed limit */
1944 if (setpriority(PRIO_PROCESS
, 0, limit
) < 0)
1947 log_debug("Cannot set requested nice level (%i), used next best (%i).", priority
, limit
);
/* Parks the calling process; declared _noreturn_.
 *
 * All file descriptors are closed first (result deliberately ignored), then children are reaped with
 * waitid(P_ALL, 0, &si, WEXITED). A waitid() failure other than EINTR ends the reaping phase.
 *
 * NOTE(review): the loop constructs around waitid() and the final freeze step are not visible in this
 * view. */
1951 _noreturn_
void freeze(void) {
1954 /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
1955 * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
1956 * to be compatible with being called from signal handlers. */
1957 (void) close_all_fds_without_malloc(NULL
, 0);
1959 /* Let's not freeze right away, but keep reaping zombies. */
1963 if (waitid(P_ALL
, 0, &si
, WEXITED
) < 0 && errno
!= EINTR
)
1967 /* waitid() failed with an unexpected error, things are really borked. Freeze now! */
/* Determines the number of threads of process 'pid' from the "Threads" field of its procfs "status"
 * file (path built with procfs_file_alloca(), field read with get_proc_field(), value parsed with
 * safe_atoi()).
 *
 * One failure path of the field lookup yields -ENOSYS if proc_mounted() reports /proc as not mounted,
 * and -ESRCH otherwise.
 *
 * NOTE(review): the condition selecting that failure path, the other error checks and the final return
 * are not visible in this view. */
1972 int get_process_threads(pid_t pid
) {
1973 _cleanup_free_
char *t
= NULL
;
1980 p
= procfs_file_alloca(pid
, "status");
1982 r
= get_proc_field(p
, "Threads", WHITESPACE
, &t
);
1984 return proc_mounted() == 0 ? -ENOSYS
: -ESRCH
;
1988 r
= safe_atoi(t
, &n
);
/* Tells whether the calling process is a reaper: PID 1 is checked first via getpid_cached(), otherwise
 * the child-subreaper flag is queried with prctl(PR_GET_CHILD_SUBREAPER) into the local 'b'.
 *
 * NOTE(review): the return statements are not visible in this view — presumably > 0 for "yes", 0 for
 * "no" and a negative errno if prctl() fails; confirm. */
1997 int is_reaper_process(void) {
2000 /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
2001 * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */
2003 if (getpid_cached() == 1)
2006 if (prctl(PR_GET_CHILD_SUBREAPER
, (unsigned long) &b
, 0UL, 0UL, 0UL) < 0)
/* Turns the child-subreaper status of the calling process on or off, according to 'b', through
 * prctl(PR_SET_CHILD_SUBREAPER). PID 1 is special-cased before the prctl() call.
 *
 * NOTE(review): the body of the PID 1 branch and all return statements are not visible in this view. */
2012 int make_reaper_process(bool b
) {
2014 if (getpid_cached() == 1) {
2022 /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
2023 * to avoid any ambiguities */
2024 if (prctl(PR_SET_CHILD_SUBREAPER
, (unsigned long) b
, 0UL, 0UL, 0UL) < 0)
/* Cleanup helper for posix_spawnattr_t pointers built on posix_spawnattr_destroy(), with NULL as the
 * empty value. Presumably this generates posix_spawnattr_destroyp(), which posix_spawn_wrapper() names
 * in a _cleanup_() attribute — confirm against the macro definition. */
2030 DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t
*, posix_spawnattr_destroy
, NULL
);
/* Spawns 'path' with 'argv' and 'envp' through the posix_spawn() family and stores a reference to the
 * new process in *ret_pidref.
 *
 * The spawn attributes are initialized with POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF and a signal
 * mask taken from a fully filled signal set. The posix_spawnattr_*() calls return a positive errno on
 * failure, which is negated before being returned.
 *
 * With HAVE_PIDFD_SPAWN, the cgroup is resolved with cg_get_path_and_check(), opened as an O_PATH
 * directory fd and attached to the attributes (POSIX_SPAWN_SETCGROUP), then pidfd_spawn() is tried. On
 * that path the function returns FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP). If pidfd_spawn() fails with a
 * not-supported or privilege error, POSIX_SPAWN_SETCGROUP is cleared and plain posix_spawn() is used; that
 * path returns 0, meaning the caller still has to move the child into the cgroup.
 *
 * NOTE(review): most parameter lines, the local declarations, the error checks between the calls, the
 * matching #endif/#else lines and part of the leading comment are not visible in this view. */
2032 int posix_spawn_wrapper(
2037 PidRef
*ret_pidref
) {
2039 short flags
= POSIX_SPAWN_SETSIGMASK
|POSIX_SPAWN_SETSIGDEF
;
2040 posix_spawnattr_t attr
;
2044 /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2045 * caller will be blocked until the child either exits or exec's. The memory of the child will be
2046 * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2049 * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2050 * if available. Note that CLONE_INTO_CGROUP is only supported on cgroup v2.
2051 * returns 1: We're already in the right cgroup
2052 * 0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2053 * needs to call 'cg_attach' on their own */
2059 assert_se(sigfillset(&mask
) >= 0);
2061 r
= posix_spawnattr_init(&attr
);
2063 return -r
; /* These functions return a positive errno on failure */
2065 /* Initialization needs to succeed before we can set up a destructor. */
2066 _unused_
_cleanup_(posix_spawnattr_destroyp
) posix_spawnattr_t
*attr_destructor
= &attr
;
2068 #if HAVE_PIDFD_SPAWN
2069 _cleanup_close_
int cgroup_fd
= -EBADF
;
2072 _cleanup_free_
char *resolved_cgroup
= NULL
;
2074 r
= cg_get_path_and_check(
2075 SYSTEMD_CGROUP_CONTROLLER
,
2082 cgroup_fd
= open(resolved_cgroup
, O_PATH
|O_DIRECTORY
|O_CLOEXEC
);
2086 r
= posix_spawnattr_setcgroup_np(&attr
, cgroup_fd
);
2090 flags
|= POSIX_SPAWN_SETCGROUP
;
2094 r
= posix_spawnattr_setflags(&attr
, flags
);
2097 r
= posix_spawnattr_setsigmask(&attr
, &mask
);
2101 #if HAVE_PIDFD_SPAWN
2102 _cleanup_close_
int pidfd
= -EBADF
;
2104 r
= pidfd_spawn(&pidfd
, path
, NULL
, &attr
, argv
, envp
);
2106 r
= pidref_set_pidfd_consume(ret_pidref
, TAKE_FD(pidfd
));
2110 return FLAGS_SET(flags
, POSIX_SPAWN_SETCGROUP
);
2112 if (ERRNO_IS_NOT_SUPPORTED(r
)) {
2113 /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode. */
2114 if (cgroup
&& cg_is_threaded(cgroup
) > 0)
2117 /* clone3() not available? */
2118 } else if (!ERRNO_IS_PRIVILEGE(r
))
2121 /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
2122 * flags to remove the cgroup one, which is what redirects to clone3() */
2123 flags
&= ~POSIX_SPAWN_SETCGROUP
;
2124 r
= posix_spawnattr_setflags(&attr
, flags
);
2130 r
= posix_spawn(&pid
, path
, NULL
, &attr
, argv
, envp
);
2134 r
= pidref_set_pid(ret_pidref
, pid
);
2138 return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
/* Opens "/proc" as a directory stream with opendir().
 *
 * NOTE(review): the error handling and the hand-over of the stream through 'ret' are not visible in this
 * view. */
2141 int proc_dir_open(DIR **ret
) {
2146 d
= opendir("/proc");
/* Reads the next process entry from an open directory stream 'd'.
 *
 * Entries come from readdir_no_dot(). An entry's d_type is tested against DT_DIR and DT_UNKNOWN, and the
 * entry name is handed to parse_pid(), which stores the resulting PID in *ret when it succeeds.
 *
 * NOTE(review): the surrounding loop, end-of-directory/error handling and the return values are not
 * visible in this view. */
2154 int proc_dir_read(DIR *d
, pid_t
*ret
) {
2161 de
= readdir_no_dot(d
);
2169 if (!IN_SET(de
->d_type
, DT_DIR
, DT_UNKNOWN
))
2172 if (parse_pid(de
->d_name
, ret
) >= 0)
/* PidRef flavour of proc_dir_read(): obtains the next PID from 'd' with proc_dir_read() and converts it
 * with pidref_set_pid() into *ret. A conversion result of -ESRCH (the process is gone by now) is handled
 * as "skip this entry" rather than as an error.
 *
 * NOTE(review): the surrounding loop and the return values are not visible in this view. */
2181 int proc_dir_read_pidref(DIR *d
, PidRef
*ret
) {
2189 r
= proc_dir_read(d
, &pid
);
2195 r
= pidref_set_pid(ret
, pid
);
2196 if (r
== -ESRCH
) /* gone by now? skip it */
/* Names for the CLD_* codes, indexed by the code itself. DEFINE_STRING_TABLE_LOOKUP presumably
 * generates the string conversion helpers for 'sigchld_code' over type int from this table — confirm
 * against the macro definition. */
2209 static const char *const sigchld_code_table
[] = {
2210 [CLD_EXITED
] = "exited",
2211 [CLD_KILLED
] = "killed",
2212 [CLD_DUMPED
] = "dumped",
2213 [CLD_TRAPPED
] = "trapped",
2214 [CLD_STOPPED
] = "stopped",
2215 [CLD_CONTINUED
] = "continued",
2218 DEFINE_STRING_TABLE_LOOKUP(sigchld_code
, int);
/* Names for the SCHED_* scheduling policies, indexed by the policy constant. The lookup helpers for
 * 'sched_policy' are defined with a fallback and INT_MAX as the macro's last argument — presumably so
 * that policies without a table entry are handled numerically up to that bound; confirm against the
 * macro definition.
 *
 * NOTE(review): the end of the initializer is not visible in this view, so the table may have further
 * entries. */
2220 static const char* const sched_policy_table
[] = {
2221 [SCHED_OTHER
] = "other",
2222 [SCHED_BATCH
] = "batch",
2223 [SCHED_IDLE
] = "idle",
2224 [SCHED_FIFO
] = "fifo",
2228 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy
, int, INT_MAX
);