]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/process-util.c
process-util: use proc_mounted() check at one more place
[thirdparty/systemd.git] / src / basic / process-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
0b452006 2
4f5dd394 3#include <ctype.h>
0b452006 4#include <errno.h>
11c3a366
TA
5#include <limits.h>
6#include <linux/oom.h>
1b46eb23 7#include <pthread.h>
6ecdfe7d 8#include <spawn.h>
4f5dd394
LP
9#include <stdbool.h>
10#include <stdio.h>
11c3a366 11#include <stdlib.h>
e2047ba9 12#include <sys/mount.h>
7b3e062c 13#include <sys/personality.h>
405f8907 14#include <sys/prctl.h>
4f5dd394
LP
15#include <sys/types.h>
16#include <sys/wait.h>
11c3a366 17#include <syslog.h>
4f5dd394 18#include <unistd.h>
349cc4a5 19#if HAVE_VALGRIND_VALGRIND_H
dcadc967
EV
20#include <valgrind/valgrind.h>
21#endif
0b452006 22
ad5db940
OJ
23#include "sd-messages.h"
24
b5efdb8a 25#include "alloc-util.h"
6e5f1b57 26#include "architecture.h"
ee617a4e 27#include "argv-util.h"
eefb7d22 28#include "dirent-util.h"
3ec2ad35 29#include "env-file.h"
21c491e1 30#include "env-util.h"
39090201 31#include "errno-util.h"
aad3a64d 32#include "escape.h"
3ffd4af2 33#include "fd-util.h"
0b452006 34#include "fileio.h"
f4f15635 35#include "fs-util.h"
3ec2ad35 36#include "hostname-util.h"
e3b4efd2 37#include "locale-util.h"
0b452006 38#include "log.h"
11c3a366 39#include "macro.h"
0a970718 40#include "memory-util.h"
f5947a5e
YW
41#include "missing_sched.h"
42#include "missing_syscall.h"
5545f336 43#include "missing_threads.h"
61ef3051 44#include "mountpoint-util.h"
0cb8e3d1 45#include "namespace-util.h"
08af3cc5 46#include "nulstr-util.h"
3ec2ad35 47#include "parse-util.h"
aad3a64d 48#include "path-util.h"
93cc7779 49#include "process-util.h"
8869a0b4 50#include "raw-clone.h"
909106eb 51#include "rlimit-util.h"
93cc7779 52#include "signal-util.h"
1359fffa 53#include "stat-util.h"
298f466f 54#include "stdio-util.h"
7b3e062c 55#include "string-table.h"
07630cea 56#include "string-util.h"
4c253ed1 57#include "terminal-util.h"
b1d4f8e1 58#include "user-util.h"
bc28751e 59#include "utf8.h"
0b452006 60
0e85cbcf
ZJS
/* Size of the buffer used for escaped "comm" names. Userspace processes are limited by the kernel to
 * TASK_COMM_LEN (16 bytes), but the kernel permits longer names for its own workers, e.g.
 * "kworker/u9:3-kcryptd/253:0". Hence pick a fixed, smallish limit that also works for those. */
#define COMM_MAX_LEN 128
65
0a51b45c 66static int get_process_state(pid_t pid) {
5c7b9974 67 _cleanup_free_ char *line = NULL;
0b452006
RC
68 const char *p;
69 char state;
70 int r;
0b452006
RC
71
72 assert(pid >= 0);
73
5c7b9974
LP
74 /* Shortcut: if we are enquired about our own state, we are obviously running */
75 if (pid == 0 || pid == getpid_cached())
76 return (unsigned char) 'R';
77
0b452006 78 p = procfs_file_alloca(pid, "stat");
a644184a 79
0b452006 80 r = read_one_line_file(p, &line);
a644184a
LP
81 if (r == -ENOENT)
82 return -ESRCH;
0b452006
RC
83 if (r < 0)
84 return r;
85
86 p = strrchr(line, ')');
87 if (!p)
88 return -EIO;
89
90 p++;
91
92 if (sscanf(p, " %c", &state) != 1)
93 return -EIO;
94
95 return (unsigned char) state;
96}
97
d7d74854 98int pid_get_comm(pid_t pid, char **ret) {
ce268825 99 _cleanup_free_ char *escaped = NULL, *comm = NULL;
0b452006
RC
100 int r;
101
ce268825 102 assert(ret);
0b452006
RC
103 assert(pid >= 0);
104
cde93ba2
LP
105 if (pid == 0 || pid == getpid_cached()) {
106 comm = new0(char, TASK_COMM_LEN + 1); /* Must fit in 16 byte according to prctl(2) */
107 if (!comm)
108 return -ENOMEM;
109
110 if (prctl(PR_GET_NAME, comm) < 0)
111 return -errno;
112 } else {
113 const char *p;
114
115 p = procfs_file_alloca(pid, "comm");
116
117 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
118 r = read_one_line_file(p, &comm);
119 if (r == -ENOENT)
120 return -ESRCH;
121 if (r < 0)
122 return r;
123 }
124
0e85cbcf 125 escaped = new(char, COMM_MAX_LEN);
ce268825
LP
126 if (!escaped)
127 return -ENOMEM;
128
ce268825 129 /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
0e85cbcf 130 cellescape(escaped, COMM_MAX_LEN, comm);
0b452006 131
ce268825
LP
132 *ret = TAKE_PTR(escaped);
133 return 0;
0b452006
RC
134}
135
d7d74854
LP
136int pidref_get_comm(const PidRef *pid, char **ret) {
137 _cleanup_free_ char *comm = NULL;
138 int r;
139
140 if (!pidref_is_set(pid))
141 return -ESRCH;
142
143 r = pid_get_comm(pid->pid, &comm);
144 if (r < 0)
145 return r;
146
147 r = pidref_verify(pid);
148 if (r < 0)
149 return r;
150
151 if (ret)
152 *ret = TAKE_PTR(comm);
153 return 0;
154}
155
a034620f 156static int pid_get_cmdline_nulstr(
61977664
ZJS
157 pid_t pid,
158 size_t max_size,
159 ProcessCmdlineFlags flags,
160 char **ret,
161 size_t *ret_size) {
162
a034620f 163 _cleanup_free_ char *t = NULL;
0b452006 164 const char *p;
bc28751e 165 size_t k;
7b7a060e 166 int r;
0b452006 167
61977664
ZJS
168 /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
169 * with a specified size.
69281c49 170 *
61977664
ZJS
171 * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
172 * (the case for kernel threads), or has a command line that resolves to the empty string, will
173 * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
174 * data.
175 *
176 * Returns an error, 0 if output was read but is truncated, 1 otherwise.
177 */
69281c49 178
0b452006 179 p = procfs_file_alloca(pid, "cmdline");
61977664
ZJS
180 r = read_virtual_file(p, max_size, &t, &k); /* Let's assume that each input byte results in >= 1
181 * columns of output. We ignore zero-width codepoints. */
fdeea3f4
ZJS
182 if (r == -ENOENT)
183 return -ESRCH;
184 if (r < 0)
185 return r;
35bbbf85 186
61977664 187 if (k == 0) {
09c1dcee 188 if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK))
0b452006
RC
189 return -ENOENT;
190
bc28751e 191 /* Kernel threads have no argv[] */
61977664 192 _cleanup_free_ char *comm = NULL;
69281c49 193
d7d74854 194 r = pid_get_comm(pid, &comm);
bc28751e
ZJS
195 if (r < 0)
196 return r;
69281c49 197
a034620f 198 free(t);
61977664 199 t = strjoin("[", comm, "]");
bc28751e
ZJS
200 if (!t)
201 return -ENOMEM;
61977664
ZJS
202
203 k = strlen(t);
204 r = k <= max_size;
205 if (r == 0) /* truncation */
206 t[max_size] = '\0';
0b452006
RC
207 }
208
a034620f
LP
209 if (ret)
210 *ret = TAKE_PTR(t);
211 if (ret_size)
212 *ret_size = k;
213
61977664
ZJS
214 return r;
215}
eb1ec489 216
a034620f 217int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
61977664
ZJS
218 _cleanup_free_ char *t = NULL;
219 size_t k;
220 char *ans;
e3b4efd2 221
61977664 222 assert(pid >= 0);
95a511b7 223 assert(ret);
61977664 224
7c52d523 225 /* Retrieve and format a command line. See above for discussion of retrieval options.
61977664
ZJS
226 *
227 * There are two main formatting modes:
228 *
229 * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
230 * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
231 * output is assumed.
232 *
233 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
234 * returned string is of the specified console width at most, abbreviated with an ellipsis.
235 *
236 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
237 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *line otherwise. */
238
a034620f 239 int full = pid_get_cmdline_nulstr(pid, max_columns, flags, &t, &k);
61977664
ZJS
240 if (full < 0)
241 return full;
242
99009ed0
ZJS
243 if (flags & (PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_QUOTE_POSIX)) {
244 ShellEscapeFlags shflags = SHELL_ESCAPE_EMPTY |
245 FLAGS_SET(flags, PROCESS_CMDLINE_QUOTE_POSIX) * SHELL_ESCAPE_POSIX;
246
61977664
ZJS
247 assert(!(flags & PROCESS_CMDLINE_USE_LOCALE));
248
249 _cleanup_strv_free_ char **args = NULL;
250
4669be62
YW
251 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
252 * See also issue #21186. */
253 args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
61977664
ZJS
254 if (!args)
255 return -ENOMEM;
256
5e659ffc 257 ans = quote_command_line(args, shflags);
61977664
ZJS
258 if (!ans)
259 return -ENOMEM;
61977664
ZJS
260 } else {
261 /* Arguments are separated by NULs. Let's replace those with spaces. */
262 for (size_t i = 0; i < k - 1; i++)
263 if (t[i] == '\0')
264 t[i] = ' ';
265
266 delete_trailing_chars(t, WHITESPACE);
267
268 bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8();
269
270 ans = escape_non_printable_full(t, max_columns,
271 eight_bit * XESCAPE_8_BIT | !full * XESCAPE_FORCE_ELLIPSIS);
272 if (!ans)
273 return -ENOMEM;
274
275 ans = str_realloc(ans);
276 }
eb1ec489 277
95a511b7 278 *ret = ans;
3ec2ad35
ZJS
279 return 0;
280}
281
a034620f
LP
282int pidref_get_cmdline(const PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
283 _cleanup_free_ char *s = NULL;
284 int r;
285
286 if (!pidref_is_set(pid))
287 return -ESRCH;
288
289 r = pid_get_cmdline(pid->pid, max_columns, flags, &s);
290 if (r < 0)
291 return r;
292
293 r = pidref_verify(pid);
294 if (r < 0)
295 return r;
296
297 if (ret)
298 *ret = TAKE_PTR(s);
299 return 0;
300}
301
302int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret) {
201423d8
YW
303 _cleanup_free_ char *t = NULL;
304 char **args;
305 size_t k;
306 int r;
307
308 assert(pid >= 0);
309 assert((flags & ~PROCESS_CMDLINE_COMM_FALLBACK) == 0);
310 assert(ret);
311
a034620f 312 r = pid_get_cmdline_nulstr(pid, SIZE_MAX, flags, &t, &k);
201423d8
YW
313 if (r < 0)
314 return r;
315
316 args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
317 if (!args)
318 return -ENOMEM;
319
320 *ret = args;
321 return 0;
322}
323
a034620f
LP
324int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret) {
325 _cleanup_strv_free_ char **args = NULL;
326 int r;
327
328 if (!pidref_is_set(pid))
329 return -ESRCH;
330
331 r = pid_get_cmdline_strv(pid->pid, flags, &args);
332 if (r < 0)
333 return r;
334
335 r = pidref_verify(pid);
336 if (r < 0)
337 return r;
338
339 if (ret)
340 *ret = TAKE_PTR(args);
341
342 return 0;
343}
344
3ec2ad35
ZJS
345int container_get_leader(const char *machine, pid_t *pid) {
346 _cleanup_free_ char *s = NULL, *class = NULL;
347 const char *p;
348 pid_t leader;
349 int r;
350
351 assert(machine);
352 assert(pid);
353
354 if (streq(machine, ".host")) {
355 *pid = 1;
356 return 0;
357 }
358
359 if (!hostname_is_valid(machine, 0))
360 return -EINVAL;
361
362 p = strjoina("/run/systemd/machines/", machine);
363 r = parse_env_file(NULL, p,
364 "LEADER", &s,
365 "CLASS", &class);
366 if (r == -ENOENT)
367 return -EHOSTDOWN;
368 if (r < 0)
369 return r;
370 if (!s)
371 return -EIO;
372
373 if (!streq_ptr(class, "container"))
374 return -EIO;
375
376 r = parse_pid(s, &leader);
377 if (r < 0)
378 return r;
379 if (leader <= 1)
380 return -EIO;
381
382 *pid = leader;
0b452006
RC
383 return 0;
384}
385
ade39d9a
NR
386int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret) {
387 int r;
388
389 assert(ret);
390
391 for (;;) {
392 pid_t ppid;
393
394 r = get_process_ppid(pid, &ppid);
395 if (r < 0)
396 return r;
397
398 r = in_same_namespace(pid, ppid, type);
399 if (r < 0)
400 return r;
401 if (r == 0) {
402 /* If the parent and the child are not in the same
403 * namespace, then the child is the leader we are
404 * looking for. */
405 *ret = pid;
406 return 0;
407 }
408
409 pid = ppid;
410 }
411}
412
fc87713b 413int pid_is_kernel_thread(pid_t pid) {
36b5119a
LP
414 _cleanup_free_ char *line = NULL;
415 unsigned long long flags;
416 size_t l, i;
0b452006 417 const char *p;
36b5119a
LP
418 char *q;
419 int r;
0b452006 420
4c701096 421 if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
0b452006 422 return 0;
36b5119a
LP
423 if (!pid_is_valid(pid))
424 return -EINVAL;
0b452006 425
36b5119a
LP
426 p = procfs_file_alloca(pid, "stat");
427 r = read_one_line_file(p, &line);
428 if (r == -ENOENT)
429 return -ESRCH;
430 if (r < 0)
431 return r;
0b452006 432
36b5119a
LP
433 /* Skip past the comm field */
434 q = strrchr(line, ')');
435 if (!q)
436 return -EINVAL;
437 q++;
438
439 /* Skip 6 fields to reach the flags field */
440 for (i = 0; i < 6; i++) {
441 l = strspn(q, WHITESPACE);
442 if (l < 1)
443 return -EINVAL;
444 q += l;
445
446 l = strcspn(q, WHITESPACE);
447 if (l < 1)
448 return -EINVAL;
449 q += l;
a644184a 450 }
0b452006 451
f21f31b2 452 /* Skip preceding whitespace */
36b5119a
LP
453 l = strspn(q, WHITESPACE);
454 if (l < 1)
455 return -EINVAL;
456 q += l;
35bbbf85 457
36b5119a
LP
458 /* Truncate the rest */
459 l = strcspn(q, WHITESPACE);
460 if (l < 1)
461 return -EINVAL;
462 q[l] = 0;
0b452006 463
36b5119a
LP
464 r = safe_atollu(q, &flags);
465 if (r < 0)
466 return r;
0b452006 467
36b5119a 468 return !!(flags & PF_KTHREAD);
0b452006
RC
469}
470
fc87713b
LP
471int pidref_is_kernel_thread(const PidRef *pid) {
472 int result, r;
473
474 if (!pidref_is_set(pid))
475 return -ESRCH;
476
477 result = pid_is_kernel_thread(pid->pid);
478 if (result < 0)
479 return result;
480
481 r = pidref_verify(pid); /* Verify that the PID wasn't reused since */
482 if (r < 0)
483 return r;
484
485 return result;
486}
487
95a511b7 488int get_process_capeff(pid_t pid, char **ret) {
0b452006 489 const char *p;
a644184a 490 int r;
0b452006 491
0b452006 492 assert(pid >= 0);
95a511b7 493 assert(ret);
0b452006
RC
494
495 p = procfs_file_alloca(pid, "status");
496
95a511b7 497 r = get_proc_field(p, "CapEff", WHITESPACE, ret);
a644184a
LP
498 if (r == -ENOENT)
499 return -ESRCH;
500
501 return r;
0b452006
RC
502}
503
aed3c5ec
LP
/* Reads the target of the symlink proc_file in the procfs directory of the given process via
 * readlink_malloc(). If that fails with -ENOENT and proc_mounted() returns > 0, -ESRCH is returned
 * instead; every other result is passed through unchanged. */
static int get_process_link_contents(pid_t pid, const char *proc_file, char **ret) {
        const char *path;
        int r;

        assert(proc_file);

        path = procfs_file_alloca(pid, proc_file);

        r = readlink_malloc(path, ret);
        if (r != -ENOENT)
                return r;

        /* NOTE(review): presumably this distinguishes "process is gone" from "procfs is not available",
         * keeping -ENOENT for the latter — confirm against proc_mounted() */
        return proc_mounted() > 0 ? -ESRCH : r;
}
515
/* Resolves the "exe" symlink of a process. If ret is non-NULL and the resulting string ends in
 * " (deleted)", that suffix is cut off. Returns 0 on success, or the negative error from
 * get_process_link_contents(). */
int get_process_exe(pid_t pid, char **ret) {
        int r;

        assert(pid >= 0);

        r = get_process_link_contents(pid, "exe", ret);
        if (r < 0)
                return r;

        if (ret) {
                char *suffix = endswith(*ret, " (deleted)");

                if (suffix)
                        *suffix = '\0';
        }

        return 0;
}
534
95a511b7 535static int get_process_id(pid_t pid, const char *field, uid_t *ret) {
0b452006 536 _cleanup_fclose_ FILE *f = NULL;
0b452006 537 const char *p;
7e7a16a0 538 int r;
0b452006
RC
539
540 assert(field);
95a511b7 541 assert(ret);
0b452006 542
07b38ba5 543 if (pid < 0)
6f8cbcdb
LP
544 return -EINVAL;
545
0b452006 546 p = procfs_file_alloca(pid, "status");
fdeea3f4
ZJS
547 r = fopen_unlocked(p, "re", &f);
548 if (r == -ENOENT)
549 return -ESRCH;
550 if (r < 0)
551 return r;
35bbbf85 552
7e7a16a0
LP
553 for (;;) {
554 _cleanup_free_ char *line = NULL;
0b452006
RC
555 char *l;
556
0ff6ff2b 557 r = read_stripped_line(f, LONG_LINE_MAX, &line);
7e7a16a0
LP
558 if (r < 0)
559 return r;
560 if (r == 0)
561 break;
562
0ff6ff2b
LP
563 l = startswith(line, field);
564 if (l) {
0b452006
RC
565 l += strspn(l, WHITESPACE);
566
567 l[strcspn(l, WHITESPACE)] = 0;
568
95a511b7 569 return parse_uid(l, ret);
0b452006
RC
570 }
571 }
572
573 return -EIO;
574}
575
8b513415
LP
/* Stores the UID of a process in *ret. For pid 0 or our own PID this is getuid(); for any other
 * process the "Uid:" line of its procfs status file is parsed. Returns 0 on success, negative on
 * failure. */
int pid_get_uid(pid_t pid, uid_t *ret) {
        assert(ret);

        if (pid != 0 && pid != getpid_cached())
                return get_process_id(pid, "Uid:", ret);

        *ret = getuid();
        return 0;
}
586
8b513415
LP
587int pidref_get_uid(const PidRef *pid, uid_t *ret) {
588 uid_t uid;
589 int r;
590
591 if (!pidref_is_set(pid))
592 return -ESRCH;
593
594 r = pid_get_uid(pid->pid, &uid);
595 if (r < 0)
596 return r;
597
598 r = pidref_verify(pid);
599 if (r < 0)
600 return r;
601
602 if (ret)
603 *ret = uid;
604 return 0;
605}
606
/* Stores the GID of a process in *ret. For pid 0 or our own PID this is getgid(); for any other
 * process the "Gid:" line of its procfs status file is parsed. Returns 0 on success, negative on
 * failure. */
int get_process_gid(pid_t pid, gid_t *ret) {
        /* *ret is written unconditionally below, hence insist on a valid pointer, like pid_get_uid()
         * does */
        assert(ret);

        if (pid == 0 || pid == getpid_cached()) {
                *ret = getgid();
                return 0;
        }

        /* get_process_id() takes a uid_t pointer, make sure we may pass a gid_t pointer instead */
        assert_cc(sizeof(uid_t) == sizeof(gid_t));
        return get_process_id(pid, "Gid:", ret);
}
617
/* Determines the current working directory of a process: via safe_getcwd() for pid 0 or our own PID,
 * otherwise by reading the "cwd" symlink in the process' procfs directory. */
int get_process_cwd(pid_t pid, char **ret) {
        assert(pid >= 0);

        if (pid != 0 && pid != getpid_cached())
                return get_process_link_contents(pid, "cwd", ret);

        return safe_getcwd(ret);
}
626
/* Determines the root directory of a process by reading the "root" symlink in its procfs directory. */
int get_process_root(pid_t pid, char **ret) {
        assert(pid >= 0);

        return get_process_link_contents(pid, "root", ret);
}
631
2a7797e9
LP
/* Upper bound (5 MiB) on how much environment block data get_process_environ() and getenv_for_pid()
 * are willing to process before giving up with -ENOBUFS. */
#define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
633
95a511b7 634int get_process_environ(pid_t pid, char **ret) {
0b452006
RC
635 _cleanup_fclose_ FILE *f = NULL;
636 _cleanup_free_ char *outcome = NULL;
319a4f4b 637 size_t sz = 0;
2a7797e9
LP
638 const char *p;
639 int r;
0b452006
RC
640
641 assert(pid >= 0);
95a511b7 642 assert(ret);
0b452006
RC
643
644 p = procfs_file_alloca(pid, "environ");
645
fdeea3f4
ZJS
646 r = fopen_unlocked(p, "re", &f);
647 if (r == -ENOENT)
648 return -ESRCH;
649 if (r < 0)
650 return r;
35bbbf85 651
2a7797e9
LP
652 for (;;) {
653 char c;
654
655 if (sz >= ENVIRONMENT_BLOCK_MAX)
656 return -ENOBUFS;
657
319a4f4b 658 if (!GREEDY_REALLOC(outcome, sz + 5))
0b452006
RC
659 return -ENOMEM;
660
2a7797e9
LP
661 r = safe_fgetc(f, &c);
662 if (r < 0)
663 return r;
664 if (r == 0)
665 break;
666
0b452006
RC
667 if (c == '\0')
668 outcome[sz++] = '\n';
669 else
670 sz += cescape_char(c, outcome + sz);
671 }
672
2a7797e9 673 outcome[sz] = '\0';
95a511b7 674 *ret = TAKE_PTR(outcome);
0b452006
RC
675
676 return 0;
677}
678
0c4d1e6d 679int get_process_ppid(pid_t pid, pid_t *ret) {
0b452006 680 _cleanup_free_ char *line = NULL;
da185cd0 681 unsigned long ppid;
0b452006 682 const char *p;
0c4d1e6d 683 int r;
0b452006
RC
684
685 assert(pid >= 0);
0b452006 686
6f8cbcdb 687 if (pid == 0 || pid == getpid_cached()) {
0c4d1e6d
LP
688 if (ret)
689 *ret = getppid();
0b452006
RC
690 return 0;
691 }
692
0c4d1e6d
LP
693 if (pid == 1) /* PID 1 has no parent, shortcut this case */
694 return -EADDRNOTAVAIL;
695
0b452006
RC
696 p = procfs_file_alloca(pid, "stat");
697 r = read_one_line_file(p, &line);
a644184a
LP
698 if (r == -ENOENT)
699 return -ESRCH;
0b452006
RC
700 if (r < 0)
701 return r;
702
0c4d1e6d
LP
703 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
704 * value, so let's skip over it manually */
0b452006
RC
705
706 p = strrchr(line, ')');
707 if (!p)
708 return -EIO;
709
710 p++;
711
712 if (sscanf(p, " "
713 "%*c " /* state */
714 "%lu ", /* ppid */
715 &ppid) != 1)
716 return -EIO;
717
0c4d1e6d
LP
718 /* If ppid is zero the process has no parent. Which might be the case for PID 1 but also for
719 * processes originating in other namespaces that are inserted into a pidns. Return a recognizable
720 * error in this case. */
721 if (ppid == 0)
722 return -EADDRNOTAVAIL;
723
da185cd0 724 if ((pid_t) ppid < 0 || (unsigned long) (pid_t) ppid != ppid)
0b452006
RC
725 return -ERANGE;
726
0c4d1e6d
LP
727 if (ret)
728 *ret = (pid_t) ppid;
0b452006
RC
729
730 return 0;
731}
732
3dee63b7
LP
733int pid_get_start_time(pid_t pid, uint64_t *ret) {
734 _cleanup_free_ char *line = NULL;
735 const char *p;
736 int r;
737
738 assert(pid >= 0);
739
740 p = procfs_file_alloca(pid, "stat");
741 r = read_one_line_file(p, &line);
742 if (r == -ENOENT)
743 return -ESRCH;
744 if (r < 0)
745 return r;
746
747 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
748 * value, so let's skip over it manually */
749
750 p = strrchr(line, ')');
751 if (!p)
752 return -EIO;
753
754 p++;
755
756 unsigned long llu;
757
758 if (sscanf(p, " "
759 "%*c " /* state */
760 "%*u " /* ppid */
761 "%*u " /* pgrp */
762 "%*u " /* session */
763 "%*u " /* tty_nr */
764 "%*u " /* tpgid */
765 "%*u " /* flags */
766 "%*u " /* minflt */
767 "%*u " /* cminflt */
768 "%*u " /* majflt */
769 "%*u " /* cmajflt */
770 "%*u " /* utime */
771 "%*u " /* stime */
772 "%*u " /* cutime */
773 "%*u " /* cstime */
774 "%*i " /* priority */
775 "%*i " /* nice */
776 "%*u " /* num_threads */
777 "%*u " /* itrealvalue */
778 "%lu ", /* starttime */
779 &llu) != 1)
780 return -EIO;
781
782 if (ret)
783 *ret = llu;
784
785 return 0;
786}
787
788int pidref_get_start_time(const PidRef *pid, uint64_t *ret) {
789 uint64_t t;
790 int r;
791
792 if (!pidref_is_set(pid))
793 return -ESRCH;
794
795 r = pid_get_start_time(pid->pid, ret ? &t : NULL);
796 if (r < 0)
797 return r;
798
799 r = pidref_verify(pid);
800 if (r < 0)
801 return r;
802
803 if (ret)
804 *ret = t;
805
806 return 0;
807}
808
95a511b7 809int get_process_umask(pid_t pid, mode_t *ret) {
5e37d193
FB
810 _cleanup_free_ char *m = NULL;
811 const char *p;
812 int r;
813
5e37d193 814 assert(pid >= 0);
95a511b7 815 assert(ret);
5e37d193
FB
816
817 p = procfs_file_alloca(pid, "status");
818
819 r = get_proc_field(p, "Umask", WHITESPACE, &m);
820 if (r == -ENOENT)
821 return -ESRCH;
ece6fc51
LP
822 if (r < 0)
823 return r;
5e37d193 824
95a511b7 825 return parse_mode(m, ret);
5e37d193
FB
826}
827
0b452006
RC
828int wait_for_terminate(pid_t pid, siginfo_t *status) {
829 siginfo_t dummy;
830
831 assert(pid >= 1);
832
833 if (!status)
834 status = &dummy;
835
836 for (;;) {
837 zero(*status);
838
839 if (waitid(P_PID, pid, status, WEXITED) < 0) {
840
841 if (errno == EINTR)
842 continue;
843
3f0083a2 844 return negative_errno();
0b452006
RC
845 }
846
847 return 0;
848 }
849}
850
851/*
852 * Return values:
853 * < 0 : wait_for_terminate() failed to get the state of the
854 * process, the process was terminated by a signal, or
855 * failed for an unknown reason.
856 * >=0 : The process terminated normally, and its exit code is
857 * returned.
858 *
859 * That is, success is indicated by a return value of zero, and an
860 * error is indicated by a non-zero value.
861 *
862 * A warning is emitted if the process terminates abnormally,
863 * and also if it returns non-zero unless check_exit_code is true.
864 */
7d4904fe
LP
865int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) {
866 _cleanup_free_ char *buffer = NULL;
0b452006 867 siginfo_t status;
7d4904fe 868 int r, prio;
0b452006 869
0b452006
RC
870 assert(pid > 1);
871
7d4904fe 872 if (!name) {
d7d74854 873 r = pid_get_comm(pid, &buffer);
7d4904fe
LP
874 if (r < 0)
875 log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pid);
876 else
877 name = buffer;
878 }
879
880 prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
881
0b452006
RC
882 r = wait_for_terminate(pid, &status);
883 if (r < 0)
7d4904fe 884 return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name));
0b452006
RC
885
886 if (status.si_code == CLD_EXITED) {
7d4904fe
LP
887 if (status.si_status != EXIT_SUCCESS)
888 log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
889 "%s failed with exit status %i.", strna(name), status.si_status);
0b452006
RC
890 else
891 log_debug("%s succeeded.", name);
892
893 return status.si_status;
7d4904fe 894
3742095b 895 } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) {
0b452006 896
7d4904fe 897 log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status));
0b452006
RC
898 return -EPROTO;
899 }
900
7d4904fe 901 log_full(prio, "%s failed due to unknown reason.", strna(name));
0b452006
RC
902 return -EPROTO;
903}
904
d5641e0d
KW
905/*
906 * Return values:
e225e5c3
LP
907 *
908 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
909 * was terminated by a signal, or failed for an unknown reason.
910 *
d5641e0d
KW
911 * >=0 : The process terminated normally with no failures.
912 *
e225e5c3
LP
913 * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
914 * states are indicated by error is indicated by a non-zero value.
915 *
916 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
917 * to remain entirely race-free.
d5641e0d
KW
918 */
919int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
920 sigset_t mask;
921 int r;
922 usec_t until;
923
924 assert_se(sigemptyset(&mask) == 0);
925 assert_se(sigaddset(&mask, SIGCHLD) == 0);
926
927 /* Drop into a sigtimewait-based timeout. Waiting for the
928 * pid to exit. */
496db330 929 until = usec_add(now(CLOCK_MONOTONIC), timeout);
d5641e0d
KW
930 for (;;) {
931 usec_t n;
932 siginfo_t status = {};
d5641e0d
KW
933
934 n = now(CLOCK_MONOTONIC);
935 if (n >= until)
936 break;
937
52bb308c 938 r = RET_NERRNO(sigtimedwait(&mask, NULL, TIMESPEC_STORE(until - n)));
d5641e0d
KW
939 /* Assuming we woke due to the child exiting. */
940 if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
941 if (status.si_pid == pid) {
7802194a 942 /* This is the correct child. */
d5641e0d 943 if (status.si_code == CLD_EXITED)
0b6a4795 944 return status.si_status == 0 ? 0 : -EPROTO;
d5641e0d
KW
945 else
946 return -EPROTO;
947 }
948 }
949 /* Not the child, check for errors and proceed appropriately */
950 if (r < 0) {
951 switch (r) {
952 case -EAGAIN:
953 /* Timed out, child is likely hung. */
954 return -ETIMEDOUT;
955 case -EINTR:
956 /* Received a different signal and should retry */
957 continue;
958 default:
959 /* Return any unexpected errors */
960 return r;
961 }
962 }
963 }
964
965 return -EPROTO;
966}
967
89c9030d
LP
968void sigkill_wait(pid_t pid) {
969 assert(pid > 1);
970
2c161210
LP
971 (void) kill(pid, SIGKILL);
972 (void) wait_for_terminate(pid, NULL);
89c9030d
LP
973}
974
975void sigkill_waitp(pid_t *pid) {
dfd14786
LP
976 PROTECT_ERRNO;
977
4d0d3d41
LP
978 if (!pid)
979 return;
980 if (*pid <= 1)
981 return;
982
89c9030d 983 sigkill_wait(*pid);
4d0d3d41
LP
984}
985
392cf1d0
SL
986void sigterm_wait(pid_t pid) {
987 assert(pid > 1);
988
2c161210
LP
989 (void) kill_and_sigcont(pid, SIGTERM);
990 (void) wait_for_terminate(pid, NULL);
392cf1d0
SL
991}
992
b293bb23
LP
/* Sends SIGKILL to the specified process, ignoring failures, and returns without waiting for it. */
void sigkill_nowait(pid_t pid) {
        assert(pid > 1);

        (void) kill(pid, SIGKILL);
}
998
999void sigkill_nowaitp(pid_t *pid) {
1000 PROTECT_ERRNO;
1001
1002 if (!pid)
1003 return;
1004 if (*pid <= 1)
1005 return;
1006
1007 sigkill_nowait(*pid);
1008}
1009
0b452006
RC
/* Sends the specified signal to a process and, if that worked, follows up with SIGCONT. Returns the
 * result of the first kill() as produced by RET_NERRNO(); the result of the SIGCONT is ignored. */
int kill_and_sigcont(pid_t pid, int sig) {
        int r = RET_NERRNO(kill(pid, sig));

        if (r < 0)
                return r;

        /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
         * affected by a process being suspended anyway. */
        if (!IN_SET(sig, SIGCONT, SIGKILL))
                (void) kill(pid, SIGCONT);

        return r;
}
1022
e70f4453 1023int getenv_for_pid(pid_t pid, const char *field, char **ret) {
0b452006
RC
1024 _cleanup_fclose_ FILE *f = NULL;
1025 char *value = NULL;
0b452006 1026 const char *path;
aa9ff6c2 1027 size_t sum = 0;
0d90bd92 1028 int r;
0b452006
RC
1029
1030 assert(pid >= 0);
1031 assert(field);
e70f4453
LP
1032 assert(ret);
1033
1034 if (pid == 0 || pid == getpid_cached()) {
1035 const char *e;
1036
1037 e = getenv(field);
1038 if (!e) {
1039 *ret = NULL;
1040 return 0;
1041 }
1042
1043 value = strdup(e);
1044 if (!value)
1045 return -ENOMEM;
1046
1047 *ret = value;
1048 return 1;
1049 }
0b452006 1050
0d90bd92
LP
1051 if (!pid_is_valid(pid))
1052 return -EINVAL;
1053
0b452006
RC
1054 path = procfs_file_alloca(pid, "environ");
1055
fdeea3f4
ZJS
1056 r = fopen_unlocked(path, "re", &f);
1057 if (r == -ENOENT)
1058 return -ESRCH;
1059 if (r < 0)
1060 return r;
35bbbf85 1061
0d90bd92
LP
1062 for (;;) {
1063 _cleanup_free_ char *line = NULL;
aa9ff6c2 1064 const char *match;
0b452006 1065
0d90bd92
LP
1066 if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */
1067 return -ENOBUFS;
0b452006 1068
0d90bd92
LP
1069 r = read_nul_string(f, LONG_LINE_MAX, &line);
1070 if (r < 0)
1071 return r;
1072 if (r == 0) /* EOF */
1073 break;
0b452006 1074
0d90bd92 1075 sum += r;
0b452006 1076
aa9ff6c2
R
1077 match = startswith(line, field);
1078 if (match && *match == '=') {
1079 value = strdup(match + 1);
0b452006
RC
1080 if (!value)
1081 return -ENOMEM;
1082
e70f4453
LP
1083 *ret = value;
1084 return 1;
0b452006 1085 }
0d90bd92 1086 }
0b452006 1087
e70f4453
LP
1088 *ret = NULL;
1089 return 0;
0b452006
RC
1090}
1091
4d051546
FB
/* Checks whether the specified process is a direct child of ours, by comparing its parent PID with our
 * own PID. Returns true or false, -ESRCH for a negative pid, or the negative error reported by
 * get_process_ppid(). PIDs 0 and 1 are never considered our children. */
int pid_is_my_child(pid_t pid) {
        pid_t parent;
        int r;

        if (pid < 0)
                return -ESRCH;

        if (pid <= 1)
                return false;

        r = get_process_ppid(pid, &parent);
        if (r < 0)
                return r;

        return parent == getpid_cached();
}
1108
6774be42
LP
1109int pidref_is_my_child(const PidRef *pid) {
1110 int r, result;
1111
1112 if (!pidref_is_set(pid))
1113 return -ESRCH;
1114
1115 result = pid_is_my_child(pid->pid);
1116 if (result < 0)
1117 return result;
1118
1119 r = pidref_verify(pid);
1120 if (r < 0)
1121 return r;
1122
1123 return result;
1124}
1125
/* Checks whether a PID is still valid at all, including a zombie. Returns -ESRCH for a negative pid,
 * true for pid 0, pid 1 and ourselves, and otherwise probes the process with kill(pid, 0): false only
 * if that fails with ESRCH. */
int pid_is_unwaited(pid_t pid) {
        if (pid < 0)
                return -ESRCH;

        /* If we or PID 1 would be dead and have been waited for, this code would not be running */
        if (pid <= 1 || pid == getpid_cached())
                return true;

        /* Any failure other than ESRCH still means that the process exists */
        return kill(pid, 0) >= 0 || errno != ESRCH;
}
1143
4d9f092b
LP
1144int pidref_is_unwaited(const PidRef *pid) {
1145 int r;
1146
1147 if (!pidref_is_set(pid))
1148 return -ESRCH;
1149
1150 if (pid->pid == 1 || pidref_is_self(pid))
1151 return true;
1152
1153 r = pidref_kill(pid, 0);
1154 if (r == -ESRCH)
1155 return false;
1156 if (r < 0)
1157 return r;
1158
1159 return true;
1160}
1161
/* Checks whether a PID is still valid and not a zombie. Returns -ESRCH for a negative pid, true for
 * pid 0, pid 1 and ourselves. Otherwise the process state is looked up: false if the process is gone
 * (-ESRCH) or its state is 'Z', true for any other state, and a negative error if the lookup fails
 * otherwise. */
int pid_is_alive(pid_t pid) {
        int state;

        if (pid < 0)
                return -ESRCH;

        /* If we or PID 1 would be a zombie, this code would not be running */
        if (pid <= 1 || pid == getpid_cached())
                return true;

        state = get_process_state(pid);
        if (state == -ESRCH)
                return false;
        if (state < 0)
                return state;

        return state != 'Z';
}
1184
1185int pidref_is_alive(const PidRef *pidref) {
1186 int r, result;
1187
1188 if (!pidref_is_set(pidref))
1189 return -ESRCH;
0b452006 1190
becdfcb9 1191 result = pid_is_alive(pidref->pid);
faf0dd4b
MY
1192 if (result < 0) {
1193 assert(result != -ESRCH);
becdfcb9 1194 return result;
faf0dd4b 1195 }
becdfcb9
LP
1196
1197 r = pidref_verify(pidref);
1198 if (r == -ESRCH)
1199 return false;
1200 if (r < 0)
1201 return r;
1202
1203 return result;
0b452006 1204}
d4510856 1205
1359fffa
MS
/* Compares the "root" entry of the specified process in procfs with "/proc/1/root" using
 * inode_same() and returns its result. Returns false for a negative pid, and true for pid 0 and
 * ourselves without looking at procfs. */
int pid_from_same_root_fs(pid_t pid) {
        const char *path;

        if (pid < 0)
                return false;

        if (pid == 0 || pid == getpid_cached())
                return true;

        path = procfs_file_alloca(pid, "root");

        return inode_same(path, "/proc/1/root", 0);
}
1219
d4510856
LP
1220bool is_main_thread(void) {
1221 static thread_local int cached = 0;
1222
1223 if (_unlikely_(cached == 0))
df0ff127 1224 cached = getpid_cached() == gettid() ? 1 : -1;
d4510856
LP
1225
1226 return cached > 0;
1227}
7b3e062c 1228
c4412d4d
LP
bool oom_score_adjust_is_valid(int oa) {
        /* Valid values lie in the closed interval [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] */
        if (oa < OOM_SCORE_ADJ_MIN)
                return false;
        if (oa > OOM_SCORE_ADJ_MAX)
                return false;

        return true;
}
1232
7b3e062c 1233unsigned long personality_from_string(const char *p) {
6b41a7b2 1234 Architecture architecture;
7b3e062c 1235
0c0fea07
LP
1236 if (!p)
1237 return PERSONALITY_INVALID;
1238
6e5f1b57
LP
1239 /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1240 * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1241 * the same register size. */
1242
1243 architecture = architecture_from_string(p);
1244 if (architecture < 0)
1245 return PERSONALITY_INVALID;
7b3e062c 1246
0c0fea07 1247 if (architecture == native_architecture())
7b3e062c 1248 return PER_LINUX;
3c58ae13
DDM
1249#ifdef ARCHITECTURE_SECONDARY
1250 if (architecture == ARCHITECTURE_SECONDARY)
f2d1736c 1251 return PER_LINUX32;
7b3e062c
LP
1252#endif
1253
1254 return PERSONALITY_INVALID;
1255}
1256
1257const char* personality_to_string(unsigned long p) {
6b41a7b2 1258 Architecture architecture = _ARCHITECTURE_INVALID;
7b3e062c 1259
7b3e062c 1260 if (p == PER_LINUX)
0c0fea07 1261 architecture = native_architecture();
3c58ae13 1262#ifdef ARCHITECTURE_SECONDARY
6e5f1b57 1263 else if (p == PER_LINUX32)
3c58ae13 1264 architecture = ARCHITECTURE_SECONDARY;
7b3e062c
LP
1265#endif
1266
6e5f1b57
LP
1267 if (architecture < 0)
1268 return NULL;
1269
1270 return architecture_to_string(architecture);
7b3e062c
LP
1271}
1272
21022b9d
LP
int safe_personality(unsigned long p) {
        int rc;

        /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a
         * failure via errno, and in others as negative return value containing an errno-like value.
         * Let's work around this: this is a wrapper that uses errno if it is set, and uses the return
         * value otherwise. And then it sets both errno and the return value indicating the same issue,
         * so that we are definitely on the safe side.
         *
         * See https://github.com/systemd/systemd/issues/6737 */

        errno = 0;
        rc = personality(p);
        if (rc >= 0)
                return rc;

        /* Failure reported through errno: hand that back as negative return value */
        if (errno != 0)
                return -errno;

        /* Failure reported only through the return value: mirror it into errno */
        errno = -rc;
        return rc;
}
1294
e8132d63
LP
1295int opinionated_personality(unsigned long *ret) {
1296 int current;
1297
1298 /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1299 * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1300 * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1301
21022b9d 1302 current = safe_personality(PERSONALITY_INVALID);
e8132d63 1303 if (current < 0)
21022b9d 1304 return current;
e8132d63 1305
3dc51ab2 1306 if (((unsigned long) current & OPINIONATED_PERSONALITY_MASK) == PER_LINUX32)
e8132d63
LP
1307 *ret = PER_LINUX32;
1308 else
1309 *ret = PER_LINUX;
1310
1311 return 0;
1312}
1313
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        pid_t helper;

        /* Only relevant when we are PID 1 and run under valgrind: fork off a helper child that exits
         * immediately, and wait for it. */
        if (getpid_cached() != 1 || !RUNNING_ON_VALGRIND)
                return;

        helper = raw_clone(SIGCHLD);
        if (helper < 0) {
                log_struct_errno(
                                LOG_EMERG, errno,
                                "MESSAGE_ID=" SD_MESSAGE_VALGRIND_HELPER_FORK_STR,
                                LOG_MESSAGE( "Failed to fork off valgrind helper: %m"));
                return;
        }

        if (helper == 0) /* in the helper: terminate right away */
                exit(EXIT_SUCCESS);

        log_info("Spawned valgrind helper as PID "PID_FMT".", helper);
        (void) wait_for_terminate(helper, NULL);
#endif
}
1333
int pid_compare_func(const pid_t *a, const pid_t *b) {
        /* Three-way comparison of two PIDs, suitable for usage in qsort() */
        pid_t x = *a, y = *b;

        if (x < y)
                return -1;
        if (x > y)
                return 1;

        return 0;
}
1338
5c30a6d2
LP
/* The cached PID, possible values:
 *
 *     == UNSET [0]  → cache not initialized yet
 *     == BUSY [-1]  → some thread is initializing it at the moment
 *        any other  → the cached PID
 *
 * The variable is read and claimed by getpid_cached() and cleared again by reset_cached_pid().
 */

#define CACHED_PID_UNSET ((pid_t) 0)
#define CACHED_PID_BUSY ((pid_t) -1)

static pid_t cached_pid = CACHED_PID_UNSET;
1350
799a960d 1351void reset_cached_pid(void) {
5c30a6d2
LP
1352 /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
1353 cached_pid = CACHED_PID_UNSET;
1354}
1355
pid_t getpid_cached(void) {
        static bool installed = false; /* whether the fork handler resetting the cache was registered already */
        pid_t current_value = CACHED_PID_UNSET;

        /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
         * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
         * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
         * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
         *
         * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
         * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
         */

        /* If the cache is still UNSET, atomically claim it by marking it BUSY. Either way current_value
         * afterwards holds what cached_pid contained before the exchange, which is what we dispatch on
         * below (hence the success/failure result itself is not needed). */
        (void) __atomic_compare_exchange_n(
                        &cached_pid,
                        &current_value,
                        CACHED_PID_BUSY,
                        false,
                        __ATOMIC_SEQ_CST,
                        __ATOMIC_SEQ_CST);

        switch (current_value) {

        case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
                pid_t new_pid;

                new_pid = raw_getpid();

                if (!installed) {
                        /* Register reset_cached_pid() as the child-side fork handler, so that the cache is
                         * invalidated in forked children. We check for errors only in the most generic
                         * fashion possible, i.e. any non-zero return value is treated as failure. */

                        if (pthread_atfork(NULL, NULL, reset_cached_pid) != 0) {
                                /* OOM? Let's try again later */
                                cached_pid = CACHED_PID_UNSET;
                                return new_pid;
                        }

                        installed = true;
                }

                /* Publish the value, replacing the BUSY marker we placed above */
                cached_pid = new_pid;
                return new_pid;
        }

        case CACHED_PID_BUSY: /* Somebody else is currently initializing */
                return raw_getpid();

        default: /* Properly initialized */
                return current_value;
        }
}
1409
fba868fa
LP
int must_be_root(void) {
        /* Returns 0 if our effective UID is 0; otherwise logs an error and returns what
         * log_error_errno() yields for a synthetic EPERM. */

        if (geteuid() != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root.");

        return 0;
}
1417
4b3abcd0
MG
1418static void restore_sigsetp(sigset_t **ssp) {
1419 if (*ssp)
1420 (void) sigprocmask(SIG_SETMASK, *ssp, NULL);
1421}
1422
29c3520f
LP
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
        size_t ps;
        pid_t pid;
        void *mystack;

        /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
         * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
         *
         * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
         * is threaded these locks will be in an undefined state in the child, and hence memory allocations
         * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
         * strictly single-threaded or your child never calls malloc().
         *
         * 'fn' is the function the child runs, 'userdata' is passed to it, 'flags' are the clone flags.
         * Returns the child's PID in the parent, or a negative errno-style value on failure. */

        /* These flags are refused outright */
        assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
                         CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);

        /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
         * the net effect is that the child will have the start of its stack inside the stack of the parent,
         * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
         * since we don't want to deal with differences between systems where the stack grows backwards or
         * forwards we'll allocate one more and place the stack address in the middle. Except that we also
         * want it page aligned, hence we'll allocate one page more. Makes 3. */

        ps = page_size();
        mystack = alloca(ps*3);
        mystack = (uint8_t*) mystack + ps;                   /* move pointer one page ahead since stacks usually grow backwards */
        mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */

#if HAVE_CLONE
        pid = clone(fn, mystack, flags, userdata);
#else
        /* No clone() available: use __clone2(), which additionally takes the stack size */
        pid = __clone2(fn, mystack, ps, flags, userdata);
#endif
        if (pid < 0)
                return -errno;

        return pid;
}
1461
e9ccae31
LP
1462static int fork_flags_to_signal(ForkFlags flags) {
1463 return (flags & FORK_DEATHSIG_SIGTERM) ? SIGTERM :
1464 (flags & FORK_DEATHSIG_SIGINT) ? SIGINT :
1465 SIGKILL;
1466}
1467
4c253ed1
LP
int safe_fork_full(
                const char *name,
                const int stdio_fds[3],
                int except_fds[],
                size_t n_except_fds,
                ForkFlags flags,
                pid_t *ret_pid) {

        pid_t original_pid, pid;
        sigset_t saved_ss, ss;
        /* While non-NULL, the cleanup handler restores the saved signal mask whenever we leave this function */
        _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
        bool block_signals = false, block_all = false, intermediary = false;
        int prio, r;

        /* FORK_DETACH cannot hand back a PID and cannot be combined with FORK_WAIT */
        assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
        assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT));

        /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking.
         * If ret_pid is non-NULL the child's PID is returned in it, in both the parent and the child.
         * Returns == 0 in the child, > 0 in the parent, and a negative errno-style value on failure. With
         * FORK_WAIT the parent waits for the child and returns -EPROTO if it did not exit with
         * EXIT_SUCCESS.
         *
         *   name                     → used in log messages; if non-NULL the child is renamed to it
         *   stdio_fds                → with FORK_REARRANGE_STDIO: fds to install as stdin/stdout/stderr
         *                              (NULL → make_null_stdio() is used instead)
         *   except_fds/n_except_fds  → fds passed to close_all_fds(), pack_fds() and fd_cloexec_many()
         *
         * Failures of the setup steps in the child are not returned; the child calls _exit(EXIT_FAILURE). */

        /* Log level for failures: errors are only shown at LOG_ERR if the caller asked for FORK_LOG */
        prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;

        /* Remember our PID, so that the child can later check whether we are still its parent */
        original_pid = getpid_cached();

        if (flags & FORK_FLUSH_STDIO) {
                fflush(stdout);
                fflush(stderr); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
        }

        if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) {
                /* We temporarily block all signals, so that the new child has them blocked initially. This
                 * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
                 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */

                assert_se(sigfillset(&ss) >= 0);
                block_signals = block_all = true;

        } else if (flags & FORK_WAIT) {
                /* Let's block SIGCHLD at least, so that we can safely watch for the child process */

                assert_se(sigemptyset(&ss) >= 0);
                assert_se(sigaddset(&ss, SIGCHLD) >= 0);
                block_signals = true;
        }

        if (block_signals) {
                if (sigprocmask(SIG_SETMASK, &ss, &saved_ss) < 0)
                        return log_full_errno(prio, errno, "Failed to set signal mask: %m");
                /* From here on the previous mask is restored automatically on return */
                saved_ssp = &saved_ss;
        }

        if (FLAGS_SET(flags, FORK_DETACH)) {
                assert(!FLAGS_SET(flags, FORK_WAIT));
                assert(!ret_pid);

                /* Fork off intermediary child if needed */

                r = is_reaper_process();
                if (r < 0)
                        return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");

                if (!r) {
                        /* Not a reaper process, hence do a double fork() so we are reparented to one */

                        pid = fork();
                        if (pid < 0)
                                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
                        if (pid > 0) {
                                log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
                                return 1; /* return in the parent */
                        }

                        /* We are the intermediary now: fork the real child below, then exit */
                        intermediary = true;
                }
        }

        /* New namespaces require raw_clone() with the matching CLONE_NEW* flags, otherwise plain fork() will do */
        if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0)
                pid = raw_clone(SIGCHLD|
                                (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
                                (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
                                (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0));
        else
                pid = fork();
        if (pid < 0)
                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
        if (pid > 0) {

                /* If we are in the intermediary process, exit now */
                if (intermediary)
                        _exit(EXIT_SUCCESS);

                /* We are in the parent process */
                log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);

                if (flags & FORK_WAIT) {
                        if (block_all) {
                                /* undo everything except SIGCHLD */
                                ss = saved_ss;
                                assert_se(sigaddset(&ss, SIGCHLD) >= 0);
                                (void) sigprocmask(SIG_SETMASK, &ss, NULL);
                        }

                        r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? WAIT_LOG : 0));
                        if (r < 0)
                                return r;
                        if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
                                return -EPROTO;
                }

                if (ret_pid)
                        *ret_pid = pid;

                return 1;
        }

        /* We are in the child process */

        /* Restore signal mask manually: disarm the cleanup handler, the mask is dealt with explicitly below */
        saved_ssp = NULL;

        if (flags & FORK_REOPEN_LOG) {
                /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
                log_close();
                log_set_open_when_needed(true);
                log_settle_target();
        }

        if (name) {
                r = rename_process(name);
                if (r < 0) /* not fatal, only logged */
                        log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
                                       r, "Failed to rename process, ignoring: %m");
        }

        if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL))
                if (prctl(PR_SET_PDEATHSIG, fork_flags_to_signal(flags)) < 0) {
                        log_full_errno(prio, errno, "Failed to set death signal: %m");
                        _exit(EXIT_FAILURE);
                }

        if (flags & FORK_RESET_SIGNALS) {
                r = reset_all_signal_handlers();
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to reset signal handlers: %m");
                        _exit(EXIT_FAILURE);
                }

                /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
                r = reset_signal_mask();
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to reset signal mask: %m");
                        _exit(EXIT_FAILURE);
                }
        } else if (block_signals) { /* undo what we did above */
                if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
                        log_full_errno(prio, errno, "Failed to restore signal mask: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGINT)) {
                pid_t ppid;
                /* Let's see if the parent PID is still the one we started from? If not, then the parent
                 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */

                ppid = getppid();
                if (ppid == 0)
                        /* Parent is in a different PID namespace. */;
                else if (ppid != original_pid) {
                        int sig = fork_flags_to_signal(flags);
                        log_debug("Parent died early, raising %s.", signal_to_string(sig));
                        (void) raise(sig);
                        _exit(EXIT_FAILURE);
                }
        }

        /* Only if both flags are set */
        if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
                /* Optionally, make sure we never propagate mounts to the host. */
                if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
                        log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
                assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));

                /* Optionally, overmount new tmpfs instance on /tmp/. */
                r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
                                   MS_NOSUID|MS_NODEV,
                                   "mode=01777" TMPFS_LIMITS_RUN);
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (flags & FORK_REARRANGE_STDIO) {
                if (stdio_fds) {
                        r = rearrange_stdio(stdio_fds[0], stdio_fds[1], stdio_fds[2]);
                        if (r < 0) {
                                log_full_errno(prio, r, "Failed to rearrange stdio fds: %m");
                                _exit(EXIT_FAILURE);
                        }

                        /* Turn off O_NONBLOCK on the stdio fds, in case it was left on */
                        stdio_disable_nonblock();
                } else {
                        r = make_null_stdio();
                        if (r < 0) {
                                log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
                                _exit(EXIT_FAILURE);
                        }
                }
        } else if (flags & FORK_STDOUT_TO_STDERR) {
                /* Only considered if FORK_REARRANGE_STDIO is not set */
                if (dup2(STDERR_FILENO, STDOUT_FILENO) < 0) {
                        log_full_errno(prio, errno, "Failed to connect stdout to stderr: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (flags & FORK_CLOSE_ALL_FDS) {
                /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
                log_close();

                r = close_all_fds(except_fds, n_except_fds);
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to close all file descriptors: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (flags & FORK_PACK_FDS) {
                /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
                 * open, this is including the log. This is required by pack_fds, which will
                 * get stuck in an infinite loop if any FDs other than except_fds are open. */
                assert(FLAGS_SET(flags, FORK_CLOSE_ALL_FDS));

                r = pack_fds(except_fds, n_except_fds);
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to pack file descriptors: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        if (flags & FORK_CLOEXEC_OFF) {
                r = fd_cloexec_many(except_fds, n_except_fds, false);
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to turn off O_CLOEXEC on file descriptors: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        /* When we were asked to reopen the logs, do so again now */
        if (flags & FORK_REOPEN_LOG) {
                log_open();
                log_set_open_when_needed(false);
        }

        if (flags & FORK_RLIMIT_NOFILE_SAFE) {
                r = rlimit_nofile_safe();
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        /* Unless asked otherwise, $NOTIFY_SOCKET is removed from the child's environment */
        if (!FLAGS_SET(flags, FORK_KEEP_NOTIFY_SOCKET)) {
                r = RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
                if (r < 0) {
                        log_full_errno(prio, r, "Failed to unset $NOTIFY_SOCKET: %m");
                        _exit(EXIT_FAILURE);
                }
        }

        /* In the child, report our own PID */
        if (ret_pid)
                *ret_pid = getpid_cached();

        return 0;
}
1748
f1713226
LP
1749int pidref_safe_fork_full(
1750 const char *name,
1751 const int stdio_fds[3],
85f660d4 1752 int except_fds[],
f1713226
LP
1753 size_t n_except_fds,
1754 ForkFlags flags,
1755 PidRef *ret_pid) {
1756
1757 pid_t pid;
1758 int r, q;
1759
1760 assert(!FLAGS_SET(flags, FORK_WAIT));
1761
1762 r = safe_fork_full(name, stdio_fds, except_fds, n_except_fds, flags, &pid);
1763 if (r < 0)
1764 return r;
1765
1766 q = pidref_set_pid(ret_pid, pid);
1767 if (q < 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1768 *ret_pid = PIDREF_MAKE_FROM_PID(pid);
1769
1770 return r;
1771}
1772
27096982
LP
int namespace_fork(
                const char *outer_name,
                const char *inner_name,
                int except_fds[],
                size_t n_except_fds,
                ForkFlags flags,
                int pidns_fd,
                int mntns_fd,
                int netns_fd,
                int userns_fd,
                int root_fd,
                pid_t *ret_pid) {

        int r;

        /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
         * process. This ensures that we are fully a member of the destination namespace, with pidns and all, so that
         * /proc/self/fd works correctly.
         *
         * Returns 0 in the inner (grandchild) process, > 0 in the original process and a negative
         * errno-style value if the first fork fails. The middle process never returns from here: it
         * waits for the grandchild and then _exit()s. */

        /* First fork: always with death signals set, and without the log/mount namespace flags */
        r = safe_fork_full(outer_name,
                           NULL,
                           except_fds, n_except_fds,
                           (flags|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
        if (r < 0)
                return r;
        if (r == 0) {
                pid_t pid;

                /* Child */

                r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd);
                if (r < 0) {
                        log_full_errno(FLAGS_SET(flags, FORK_LOG) ? LOG_ERR : LOG_DEBUG, r, "Failed to join namespace: %m");
                        _exit(EXIT_FAILURE);
                }

                /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
                r = safe_fork_full(inner_name,
                                   NULL,
                                   except_fds, n_except_fds,
                                   flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO), &pid);
                if (r < 0)
                        _exit(EXIT_FAILURE);
                if (r == 0) {
                        /* Child */
                        if (ret_pid)
                                *ret_pid = pid;
                        return 0;
                }

                /* Middle process: wait for the grandchild and exit with what the wait reported */
                r = wait_for_terminate_and_check(inner_name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
                if (r < 0)
                        _exit(EXIT_FAILURE);

                _exit(r);
        }

        /* Original process */
        return 1;
}
1832
9f8168eb
LP
1833int set_oom_score_adjust(int value) {
1834 char t[DECIMAL_STR_MAX(int)];
1835
3bd6a01c 1836 xsprintf(t, "%i", value);
9f8168eb
LP
1837
1838 return write_string_file("/proc/self/oom_score_adj", t,
1839 WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
1840}
1841
2c37c613 1842int get_oom_score_adjust(int *ret) {
ed90de9f 1843 _cleanup_free_ char *t = NULL;
2c37c613
LP
1844 int r, a;
1845
1846 r = read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX, &t, NULL);
1847 if (r < 0)
1848 return r;
1849
1850 delete_trailing_chars(t, WHITESPACE);
1851
1852 assert_se(safe_atoi(t, &a) >= 0);
1853 assert_se(oom_score_adjust_is_valid(a));
1854
1855 if (ret)
1856 *ret = a;
1857 return 0;
1858}
1859
298f466f
LP
1860int pidfd_get_pid(int fd, pid_t *ret) {
1861 char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
1862 _cleanup_free_ char *fdinfo = NULL;
1863 char *p;
1864 int r;
1865
38cdd08b
LP
1866 /* Converts a pidfd into a pid. Well known errors:
1867 *
1868 * -EBADF → fd invalid
1869 * -ENOSYS → /proc/ not mounted
1870 * -ENOTTY → fd valid, but not a pidfd
1871 * -EREMOTE → fd valid, but pid is in another namespace we cannot translate to the local one
1872 * -ESRCH → fd valid, but process is already reaped
1873 */
1874
298f466f
LP
1875 if (fd < 0)
1876 return -EBADF;
1877
1878 xsprintf(path, "/proc/self/fdinfo/%i", fd);
1879
627055ce 1880 r = read_full_virtual_file(path, &fdinfo, NULL);
298f466f 1881 if (r == -ENOENT) /* if fdinfo doesn't exist we assume the process does not exist */
38cdd08b 1882 return proc_mounted() > 0 ? -EBADF : -ENOSYS;
298f466f
LP
1883 if (r < 0)
1884 return r;
1885
50ed5cbf
LP
1886 p = find_line_startswith(fdinfo, "Pid:");
1887 if (!p)
1888 return -ENOTTY; /* not a pidfd? */
298f466f
LP
1889
1890 p += strspn(p, WHITESPACE);
1891 p[strcspn(p, WHITESPACE)] = 0;
1892
38cdd08b
LP
1893 if (streq(p, "0"))
1894 return -EREMOTE; /* PID is in foreign PID namespace? */
1895 if (streq(p, "-1"))
1896 return -ESRCH; /* refers to reaped process? */
1897
298f466f
LP
1898 return parse_pid(p, ret);
1899}
1900
f840c7d5
LB
int pidfd_verify_pid(int pidfd, pid_t pid) {
        pid_t actual;
        int r;

        /* Checks that 'pidfd' refers to the process 'pid'. Returns 0 if it does, -ESRCH if it refers to a
         * different PID, and otherwise whatever error pidfd_get_pid() reports. */

        assert(pidfd >= 0);
        assert(pid > 0);

        r = pidfd_get_pid(pidfd, &actual);
        if (r < 0)
                return r;

        if (actual != pid)
                return -ESRCH;

        return 0;
}
1914
39090201
DJL
static int rlimit_to_nice(rlim_t limit) {
        /* Translates an RLIMIT_NICE style limit into the corresponding nice level, clamped to the
         * range PRIO_MIN … PRIO_MAX-1. */

        if (limit >= -PRIO_MIN + PRIO_MAX)
                return PRIO_MIN;   /* i.e. -20 */

        if (limit <= 1)
                return PRIO_MAX-1; /* i.e. 19 */

        return PRIO_MAX - (int) limit;
}
1924
int setpriority_closest(int priority) {
        struct rlimit rl;
        int denied, cur, best;

        /* Sets our nice level to 'priority', or, if we lack the privileges for that, to the closest
         * level RLIMIT_NICE permits. Returns 1 if the requested level is in effect, 0 if we had to settle
         * for something else (or could not change anything), negative errno-style value on failure. */

        /* Try to set requested nice level */
        if (setpriority(PRIO_PROCESS, 0, priority) >= 0)
                return 1;

        /* Anything but a permission problem is propagated as is */
        denied = -errno;
        if (!ERRNO_IS_PRIVILEGE(denied))
                return denied;

        /* getpriority() may legitimately return negative values, hence errors are detected via errno */
        errno = 0;
        cur = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        if (priority == cur)
                return 1;

        /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it
         * doesn't, then the whole setpriority() system call is blocked to us, hence let's propagate the
         * error right-away */
        if (priority > cur)
                return denied;

        if (getrlimit(RLIMIT_NICE, &rl) < 0)
                return -errno;

        best = rlimit_to_nice(rl.rlim_cur);

        /* We are already less nice than limit allows us */
        if (cur < best) {
                log_debug("Cannot raise nice level, permissions and the resource limit do not allow it.");
                return 0;
        }

        /* Push to the allowed limit */
        if (setpriority(PRIO_PROCESS, 0, best) < 0)
                return -errno;

        log_debug("Cannot set requested nice level (%i), used next best (%i).", priority, best);
        return 0;
}
1970
8ddefb8e
LP
1971_noreturn_ void freeze(void) {
1972 log_close();
1973
ab27b2fe
LP
1974 /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
1975 * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
1976 * to be compatible with being called from signal handlers. */
1977 (void) close_all_fds_without_malloc(NULL, 0);
8ddefb8e 1978
8ddefb8e
LP
1979 /* Let's not freeze right away, but keep reaping zombies. */
1980 for (;;) {
1981 siginfo_t si = {};
1982
1983 if (waitid(P_ALL, 0, &si, WEXITED) < 0 && errno != EINTR)
1984 break;
1985 }
1986
1987 /* waitid() failed with an unexpected error, things are really borked. Freeze now! */
1988 for (;;)
1989 pause();
1990}
1991
6aa90884
LP
1992int get_process_threads(pid_t pid) {
1993 _cleanup_free_ char *t = NULL;
1994 const char *p;
1995 int n, r;
1996
1997 if (pid < 0)
1998 return -EINVAL;
1999
2000 p = procfs_file_alloca(pid, "status");
2001
2002 r = get_proc_field(p, "Threads", WHITESPACE, &t);
2003 if (r == -ENOENT)
2004 return proc_mounted() == 0 ? -ENOSYS : -ESRCH;
2005 if (r < 0)
2006 return r;
2007
2008 r = safe_atoi(t, &n);
2009 if (r < 0)
2010 return r;
2011 if (n < 0)
2012 return -EINVAL;
2013
2014 return n;
2015}
2016
09f9530b
LP
int is_reaper_process(void) {
        int subreaper = 0;

        /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
         * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called.
         * Returns > 0 if so, 0 if not, negative errno-style value if the prctl() fails. */

        if (getpid_cached() == 1)
                return true;

        /* The kernel writes the current subreaper setting into the integer we pass by address */
        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &subreaper, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return subreaper != 0;
}
2031
8c3fe1b5
LP
int make_reaper_process(bool b) {

        /* Turns the child subreaper setting of this process on or off. For PID 1 nothing is changed:
         * turning it on is a no-op there, turning it off is refused with -EINVAL. */

        if (getpid_cached() == 1)
                return b ? 0 : -EINVAL;

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2049
2e106312
LB
/* Cleanup helper for posix_spawnattr_t, built around posix_spawnattr_destroy(); presumably this defines
 * the posix_spawnattr_destroyp() handler that posix_spawn_wrapper() uses via _cleanup_() — TODO confirm
 * against the macro definition. */
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
2051
c9033540
MY
int posix_spawn_wrapper(
                const char *path,
                char * const *argv,
                char * const *envp,
                const char *cgroup,
                PidRef *ret_pidref) {

        short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF;
        posix_spawnattr_t attr;
        sigset_t mask;
        int r;

        /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
         * caller will be blocked until the child either exits or exec's. The memory of the child will be
         * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
         * issues.
         *
         * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
         * if available. Note that CLONE_INTO_CGROUP is only supported on cgroup v2.
         *
         * Returns:
         *    1   → the child was spawned directly into 'cgroup', nothing left to do
         *    0   → 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
         *          needs to call 'cg_attach' on their own
         *   < 0  → negative errno-style value on failure
         *
         * On success a reference to the new process is stored in *ret_pidref. */

        assert(path);
        assert(argv);
        assert(ret_pidref);

        /* The full signal set is passed as signal mask attribute below */
        assert_se(sigfillset(&mask) >= 0);

        r = posix_spawnattr_init(&attr);
        if (r != 0)
                return -r; /* These functions return a positive errno on failure */

        /* Initialization needs to succeed before we can set up a destructor. */
        _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;

#if HAVE_PIDFD_SPAWN
        _cleanup_close_ int cgroup_fd = -EBADF;

        if (cgroup) {
                _cleanup_free_ char *resolved_cgroup = NULL;

                /* Resolve the cgroup to a file system path and open it, so that it can be attached to the
                 * spawn attributes as an fd */
                r = cg_get_path_and_check(
                                SYSTEMD_CGROUP_CONTROLLER,
                                cgroup,
                                /* suffix= */ NULL,
                                &resolved_cgroup);
                if (r < 0)
                        return r;

                cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
                if (cgroup_fd < 0)
                        return -errno;

                r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
                if (r != 0)
                        return -r;

                flags |= POSIX_SPAWN_SETCGROUP;
        }
#endif

        r = posix_spawnattr_setflags(&attr, flags);
        if (r != 0)
                return -r;
        r = posix_spawnattr_setsigmask(&attr, &mask);
        if (r != 0)
                return -r;

#if HAVE_PIDFD_SPAWN
        _cleanup_close_ int pidfd = -EBADF;

        /* Preferred path: spawn and get a pidfd for the child right away */
        r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
        if (r == 0) {
                r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
                if (r < 0)
                        return r;

                /* 1 if the child was placed into the cgroup at spawn time, 0 otherwise */
                return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
        }
        /* Only "not supported" and permission style failures trigger the fallback below */
        if (!(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)))
                return -r;

        /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
         * flags to remove the cgroup one, which is what redirects to clone3() */
        flags &= ~POSIX_SPAWN_SETCGROUP;
        r = posix_spawnattr_setflags(&attr, flags);
        if (r != 0)
                return -r;
#endif

        /* Fallback path (or the only path if pidfd_spawn() is not available at build time) */
        pid_t pid;
        r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
        if (r != 0)
                return -r;

        r = pidref_set_pid(ret_pidref, pid);
        if (r < 0)
                return r;

        return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
}
2154
eefb7d22
LP
int proc_dir_open(DIR **ret) {
        DIR *dir;

        /* Opens /proc for enumeration. On success stores the directory stream in *ret and returns 0,
         * otherwise returns a negative errno-style value and leaves *ret alone. */

        assert(ret);

        dir = opendir("/proc");
        if (dir) {
                *ret = dir;
                return 0;
        }

        return -errno;
}
2167
2168int proc_dir_read(DIR *d, pid_t *ret) {
2169 assert(d);
2170
2171 for (;;) {
2172 struct dirent *de;
2173
2174 errno = 0;
2175 de = readdir_no_dot(d);
2176 if (!de) {
2177 if (errno != 0)
2178 return -errno;
2179
2180 break;
2181 }
2182
2183 if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN))
2184 continue;
2185
2186 if (parse_pid(de->d_name, ret) >= 0)
2187 return 1;
2188 }
2189
2190 if (ret)
2191 *ret = 0;
2192 return 0;
2193}
2194
2195int proc_dir_read_pidref(DIR *d, PidRef *ret) {
2196 int r;
2197
2198 assert(d);
2199
2200 for (;;) {
2201 pid_t pid;
2202
2203 r = proc_dir_read(d, &pid);
2204 if (r < 0)
2205 return r;
2206 if (r == 0)
2207 break;
2208
2209 r = pidref_set_pid(ret, pid);
2210 if (r == -ESRCH) /* gone by now? skip it */
2211 continue;
2212 if (r < 0)
2213 return r;
2214
2215 return 1;
2216 }
2217
2218 if (ret)
2219 *ret = PIDREF_NULL;
2220 return 0;
2221}
2222
7b3e062c
LP
/* Names for the CLD_* codes, indexed by code */
static const char *const sigchld_code_table[] = {
        [CLD_EXITED] = "exited",
        [CLD_KILLED] = "killed",
        [CLD_DUMPED] = "dumped",
        [CLD_TRAPPED] = "trapped",
        [CLD_STOPPED] = "stopped",
        [CLD_CONTINUED] = "continued",
};

/* Presumably generates the sigchld_code string lookup helpers from the table above — confirm against string-table.h */
DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
2233
/* Names for the SCHED_* scheduling policies, indexed by policy */
static const char* const sched_policy_table[] = {
        [SCHED_OTHER] = "other",
        [SCHED_BATCH] = "batch",
        [SCHED_IDLE] = "idle",
        [SCHED_FIFO] = "fifo",
        [SCHED_RR] = "rr",
};

/* Presumably generates the sched_policy string lookup helpers, with a fallback for values up to INT_MAX
 * that have no entry in the table — confirm against string-table.h */
DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);