]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/process-util.c
doc: mention that units can be masked via credentials
[thirdparty/systemd.git] / src / basic / process-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <ctype.h>
4 #include <errno.h>
5 #include <limits.h>
6 #include <linux/oom.h>
7 #include <pthread.h>
8 #include <spawn.h>
9 #include <stdbool.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <sys/mount.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <syslog.h>
18 #include <unistd.h>
19 #if HAVE_VALGRIND_VALGRIND_H
20 #include <valgrind/valgrind.h>
21 #endif
22
23 #include "sd-messages.h"
24
25 #include "alloc-util.h"
26 #include "architecture.h"
27 #include "argv-util.h"
28 #include "cgroup-util.h"
29 #include "dirent-util.h"
30 #include "env-file.h"
31 #include "env-util.h"
32 #include "errno-util.h"
33 #include "escape.h"
34 #include "fd-util.h"
35 #include "fileio.h"
36 #include "fs-util.h"
37 #include "hostname-util.h"
38 #include "locale-util.h"
39 #include "log.h"
40 #include "macro.h"
41 #include "memory-util.h"
42 #include "missing_sched.h"
43 #include "missing_syscall.h"
44 #include "missing_threads.h"
45 #include "mountpoint-util.h"
46 #include "namespace-util.h"
47 #include "nulstr-util.h"
48 #include "parse-util.h"
49 #include "path-util.h"
50 #include "process-util.h"
51 #include "raw-clone.h"
52 #include "rlimit-util.h"
53 #include "signal-util.h"
54 #include "stat-util.h"
55 #include "stdio-util.h"
56 #include "string-table.h"
57 #include "string-util.h"
58 #include "terminal-util.h"
59 #include "user-util.h"
60 #include "utf8.h"
61
/* Size of the buffer used for (escaped) process names. Userspace processes are restricted by the kernel to
 * TASK_COMM_LEN (16 bytes), but the kernel's own workers may carry longer names, for example
 * "kworker/u9:3-kcryptd/253:0". Hence use a fixed, reasonably small limit that also covers those. */
#define COMM_MAX_LEN 128
66
67 static int get_process_state(pid_t pid) {
68 _cleanup_free_ char *line = NULL;
69 const char *p;
70 char state;
71 int r;
72
73 assert(pid >= 0);
74
75 /* Shortcut: if we are enquired about our own state, we are obviously running */
76 if (pid == 0 || pid == getpid_cached())
77 return (unsigned char) 'R';
78
79 p = procfs_file_alloca(pid, "stat");
80
81 r = read_one_line_file(p, &line);
82 if (r == -ENOENT)
83 return -ESRCH;
84 if (r < 0)
85 return r;
86
87 p = strrchr(line, ')');
88 if (!p)
89 return -EIO;
90
91 p++;
92
93 if (sscanf(p, " %c", &state) != 1)
94 return -EIO;
95
96 return (unsigned char) state;
97 }
98
99 int pid_get_comm(pid_t pid, char **ret) {
100 _cleanup_free_ char *escaped = NULL, *comm = NULL;
101 int r;
102
103 assert(ret);
104 assert(pid >= 0);
105
106 if (pid == 0 || pid == getpid_cached()) {
107 comm = new0(char, TASK_COMM_LEN + 1); /* Must fit in 16 byte according to prctl(2) */
108 if (!comm)
109 return -ENOMEM;
110
111 if (prctl(PR_GET_NAME, comm) < 0)
112 return -errno;
113 } else {
114 const char *p;
115
116 p = procfs_file_alloca(pid, "comm");
117
118 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
119 r = read_one_line_file(p, &comm);
120 if (r == -ENOENT)
121 return -ESRCH;
122 if (r < 0)
123 return r;
124 }
125
126 escaped = new(char, COMM_MAX_LEN);
127 if (!escaped)
128 return -ENOMEM;
129
130 /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
131 cellescape(escaped, COMM_MAX_LEN, comm);
132
133 *ret = TAKE_PTR(escaped);
134 return 0;
135 }
136
137 int pidref_get_comm(const PidRef *pid, char **ret) {
138 _cleanup_free_ char *comm = NULL;
139 int r;
140
141 if (!pidref_is_set(pid))
142 return -ESRCH;
143
144 r = pid_get_comm(pid->pid, &comm);
145 if (r < 0)
146 return r;
147
148 r = pidref_verify(pid);
149 if (r < 0)
150 return r;
151
152 if (ret)
153 *ret = TAKE_PTR(comm);
154 return 0;
155 }
156
157 static int pid_get_cmdline_nulstr(
158 pid_t pid,
159 size_t max_size,
160 ProcessCmdlineFlags flags,
161 char **ret,
162 size_t *ret_size) {
163
164 _cleanup_free_ char *t = NULL;
165 const char *p;
166 size_t k;
167 int r;
168
169 /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
170 * with a specified size.
171 *
172 * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
173 * (the case for kernel threads), or has a command line that resolves to the empty string, will
174 * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
175 * data.
176 *
177 * Returns an error, 0 if output was read but is truncated, 1 otherwise.
178 */
179
180 p = procfs_file_alloca(pid, "cmdline");
181 r = read_virtual_file(p, max_size, &t, &k); /* Let's assume that each input byte results in >= 1
182 * columns of output. We ignore zero-width codepoints. */
183 if (r == -ENOENT)
184 return -ESRCH;
185 if (r < 0)
186 return r;
187
188 if (k == 0) {
189 if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK))
190 return -ENOENT;
191
192 /* Kernel threads have no argv[] */
193 _cleanup_free_ char *comm = NULL;
194
195 r = pid_get_comm(pid, &comm);
196 if (r < 0)
197 return r;
198
199 free(t);
200 t = strjoin("[", comm, "]");
201 if (!t)
202 return -ENOMEM;
203
204 k = strlen(t);
205 r = k <= max_size;
206 if (r == 0) /* truncation */
207 t[max_size] = '\0';
208 }
209
210 if (ret)
211 *ret = TAKE_PTR(t);
212 if (ret_size)
213 *ret_size = k;
214
215 return r;
216 }
217
218 int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
219 _cleanup_free_ char *t = NULL;
220 size_t k;
221 char *ans;
222
223 assert(pid >= 0);
224 assert(ret);
225
226 /* Retrieve and format a command line. See above for discussion of retrieval options.
227 *
228 * There are two main formatting modes:
229 *
230 * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
231 * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
232 * output is assumed.
233 *
234 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
235 * returned string is of the specified console width at most, abbreviated with an ellipsis.
236 *
237 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
238 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *line otherwise. */
239
240 int full = pid_get_cmdline_nulstr(pid, max_columns, flags, &t, &k);
241 if (full < 0)
242 return full;
243
244 if (flags & (PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_QUOTE_POSIX)) {
245 ShellEscapeFlags shflags = SHELL_ESCAPE_EMPTY |
246 FLAGS_SET(flags, PROCESS_CMDLINE_QUOTE_POSIX) * SHELL_ESCAPE_POSIX;
247
248 assert(!(flags & PROCESS_CMDLINE_USE_LOCALE));
249
250 _cleanup_strv_free_ char **args = NULL;
251
252 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
253 * See also issue #21186. */
254 args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
255 if (!args)
256 return -ENOMEM;
257
258 ans = quote_command_line(args, shflags);
259 if (!ans)
260 return -ENOMEM;
261 } else {
262 /* Arguments are separated by NULs. Let's replace those with spaces. */
263 for (size_t i = 0; i < k - 1; i++)
264 if (t[i] == '\0')
265 t[i] = ' ';
266
267 delete_trailing_chars(t, WHITESPACE);
268
269 bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8();
270
271 ans = escape_non_printable_full(t, max_columns,
272 eight_bit * XESCAPE_8_BIT | !full * XESCAPE_FORCE_ELLIPSIS);
273 if (!ans)
274 return -ENOMEM;
275
276 ans = str_realloc(ans);
277 }
278
279 *ret = ans;
280 return 0;
281 }
282
283 int pidref_get_cmdline(const PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
284 _cleanup_free_ char *s = NULL;
285 int r;
286
287 if (!pidref_is_set(pid))
288 return -ESRCH;
289
290 r = pid_get_cmdline(pid->pid, max_columns, flags, &s);
291 if (r < 0)
292 return r;
293
294 r = pidref_verify(pid);
295 if (r < 0)
296 return r;
297
298 if (ret)
299 *ret = TAKE_PTR(s);
300 return 0;
301 }
302
303 int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret) {
304 _cleanup_free_ char *t = NULL;
305 char **args;
306 size_t k;
307 int r;
308
309 assert(pid >= 0);
310 assert((flags & ~PROCESS_CMDLINE_COMM_FALLBACK) == 0);
311 assert(ret);
312
313 r = pid_get_cmdline_nulstr(pid, SIZE_MAX, flags, &t, &k);
314 if (r < 0)
315 return r;
316
317 args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
318 if (!args)
319 return -ENOMEM;
320
321 *ret = args;
322 return 0;
323 }
324
325 int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret) {
326 _cleanup_strv_free_ char **args = NULL;
327 int r;
328
329 if (!pidref_is_set(pid))
330 return -ESRCH;
331
332 r = pid_get_cmdline_strv(pid->pid, flags, &args);
333 if (r < 0)
334 return r;
335
336 r = pidref_verify(pid);
337 if (r < 0)
338 return r;
339
340 if (ret)
341 *ret = TAKE_PTR(args);
342
343 return 0;
344 }
345
346 int container_get_leader(const char *machine, pid_t *pid) {
347 _cleanup_free_ char *s = NULL, *class = NULL;
348 const char *p;
349 pid_t leader;
350 int r;
351
352 assert(machine);
353 assert(pid);
354
355 if (streq(machine, ".host")) {
356 *pid = 1;
357 return 0;
358 }
359
360 if (!hostname_is_valid(machine, 0))
361 return -EINVAL;
362
363 p = strjoina("/run/systemd/machines/", machine);
364 r = parse_env_file(NULL, p,
365 "LEADER", &s,
366 "CLASS", &class);
367 if (r == -ENOENT)
368 return -EHOSTDOWN;
369 if (r < 0)
370 return r;
371 if (!s)
372 return -EIO;
373
374 if (!streq_ptr(class, "container"))
375 return -EIO;
376
377 r = parse_pid(s, &leader);
378 if (r < 0)
379 return r;
380 if (leader <= 1)
381 return -EIO;
382
383 *pid = leader;
384 return 0;
385 }
386
387 int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret) {
388 int r;
389
390 assert(ret);
391
392 for (;;) {
393 pid_t ppid;
394
395 r = get_process_ppid(pid, &ppid);
396 if (r < 0)
397 return r;
398
399 r = in_same_namespace(pid, ppid, type);
400 if (r < 0)
401 return r;
402 if (r == 0) {
403 /* If the parent and the child are not in the same
404 * namespace, then the child is the leader we are
405 * looking for. */
406 *ret = pid;
407 return 0;
408 }
409
410 pid = ppid;
411 }
412 }
413
414 int pid_is_kernel_thread(pid_t pid) {
415 _cleanup_free_ char *line = NULL;
416 unsigned long long flags;
417 size_t l, i;
418 const char *p;
419 char *q;
420 int r;
421
422 if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
423 return 0;
424 if (!pid_is_valid(pid))
425 return -EINVAL;
426
427 p = procfs_file_alloca(pid, "stat");
428 r = read_one_line_file(p, &line);
429 if (r == -ENOENT)
430 return -ESRCH;
431 if (r < 0)
432 return r;
433
434 /* Skip past the comm field */
435 q = strrchr(line, ')');
436 if (!q)
437 return -EINVAL;
438 q++;
439
440 /* Skip 6 fields to reach the flags field */
441 for (i = 0; i < 6; i++) {
442 l = strspn(q, WHITESPACE);
443 if (l < 1)
444 return -EINVAL;
445 q += l;
446
447 l = strcspn(q, WHITESPACE);
448 if (l < 1)
449 return -EINVAL;
450 q += l;
451 }
452
453 /* Skip preceding whitespace */
454 l = strspn(q, WHITESPACE);
455 if (l < 1)
456 return -EINVAL;
457 q += l;
458
459 /* Truncate the rest */
460 l = strcspn(q, WHITESPACE);
461 if (l < 1)
462 return -EINVAL;
463 q[l] = 0;
464
465 r = safe_atollu(q, &flags);
466 if (r < 0)
467 return r;
468
469 return !!(flags & PF_KTHREAD);
470 }
471
472 int pidref_is_kernel_thread(const PidRef *pid) {
473 int result, r;
474
475 if (!pidref_is_set(pid))
476 return -ESRCH;
477
478 result = pid_is_kernel_thread(pid->pid);
479 if (result < 0)
480 return result;
481
482 r = pidref_verify(pid); /* Verify that the PID wasn't reused since */
483 if (r < 0)
484 return r;
485
486 return result;
487 }
488
489 int get_process_capeff(pid_t pid, char **ret) {
490 const char *p;
491 int r;
492
493 assert(pid >= 0);
494 assert(ret);
495
496 p = procfs_file_alloca(pid, "status");
497
498 r = get_proc_field(p, "CapEff", WHITESPACE, ret);
499 if (r == -ENOENT)
500 return -ESRCH;
501
502 return r;
503 }
504
/* Reads the target of the symlink /proc/<pid>/<proc_file> into *ret. If the link does not exist while
 * proc_mounted() reports a positive result, -ESRCH is returned; in all other cases the result of
 * readlink_malloc() is passed through unmodified. */
static int get_process_link_contents(pid_t pid, const char *proc_file, char **ret) {
        const char *path;
        int r;

        assert(proc_file);

        path = procfs_file_alloca(pid, proc_file);

        r = readlink_malloc(path, ret);
        if (r == -ENOENT && proc_mounted() > 0)
                return -ESRCH;

        return r;
}
516
/* Resolves the /proc/<pid>/exe symlink of a process. If a result is requested, a trailing " (deleted)"
 * marker is cut off the returned path. Returns 0 on success, or the negative error reported by
 * get_process_link_contents(). */
int get_process_exe(pid_t pid, char **ret) {
        char *marker;
        int r;

        assert(pid >= 0);

        r = get_process_link_contents(pid, "exe", ret);
        if (r < 0)
                return r;

        if (!ret)
                return 0;

        marker = endswith(*ret, " (deleted)");
        if (marker)
                *marker = '\0';

        return 0;
}
535
536 static int get_process_id(pid_t pid, const char *field, uid_t *ret) {
537 _cleanup_fclose_ FILE *f = NULL;
538 const char *p;
539 int r;
540
541 assert(field);
542 assert(ret);
543
544 if (pid < 0)
545 return -EINVAL;
546
547 p = procfs_file_alloca(pid, "status");
548 r = fopen_unlocked(p, "re", &f);
549 if (r == -ENOENT)
550 return -ESRCH;
551 if (r < 0)
552 return r;
553
554 for (;;) {
555 _cleanup_free_ char *line = NULL;
556 char *l;
557
558 r = read_stripped_line(f, LONG_LINE_MAX, &line);
559 if (r < 0)
560 return r;
561 if (r == 0)
562 break;
563
564 l = startswith(line, field);
565 if (l) {
566 l += strspn(l, WHITESPACE);
567
568 l[strcspn(l, WHITESPACE)] = 0;
569
570 return parse_uid(l, ret);
571 }
572 }
573
574 return -EIO;
575 }
576
/* Determines the UID of a process. For pid 0 or our own PID getuid() is used, otherwise the value is
 * taken from the "Uid:" line of /proc/<pid>/status. Returns 0 on success, a negative errno-style error
 * otherwise. */
int pid_get_uid(pid_t pid, uid_t *ret) {
        assert(ret);

        if (pid != 0 && pid != getpid_cached())
                return get_process_id(pid, "Uid:", ret);

        *ret = getuid();
        return 0;
}
587
588 int pidref_get_uid(const PidRef *pid, uid_t *ret) {
589 uid_t uid;
590 int r;
591
592 if (!pidref_is_set(pid))
593 return -ESRCH;
594
595 r = pid_get_uid(pid->pid, &uid);
596 if (r < 0)
597 return r;
598
599 r = pidref_verify(pid);
600 if (r < 0)
601 return r;
602
603 if (ret)
604 *ret = uid;
605 return 0;
606 }
607
/* Determines the GID of a process. For pid 0 or our own PID getgid() is used, otherwise the value is
 * taken from the "Gid:" line of /proc/<pid>/status. Returns 0 on success, a negative errno-style error
 * otherwise. */
int get_process_gid(pid_t pid, gid_t *ret) {

        /* get_process_id() takes a uid_t pointer, which only works if both types have the same size */
        assert_cc(sizeof(uid_t) == sizeof(gid_t));

        if (pid != 0 && pid != getpid_cached())
                return get_process_id(pid, "Gid:", ret);

        *ret = getgid();
        return 0;
}
618
/* Determines the current working directory of a process: via safe_getcwd() for pid 0 or our own PID, via
 * the /proc/<pid>/cwd symlink otherwise. */
int get_process_cwd(pid_t pid, char **ret) {
        assert(pid >= 0);

        if (pid != 0 && pid != getpid_cached())
                return get_process_link_contents(pid, "cwd", ret);

        return safe_getcwd(ret);
}
627
/* Determines the root directory of a process by reading the /proc/<pid>/root symlink. Returns whatever
 * get_process_link_contents() returns. */
int get_process_root(pid_t pid, char **ret) {
        int r;

        assert(pid >= 0);

        r = get_process_link_contents(pid, "root", ret);
        return r;
}
632
633 #define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
634
635 int get_process_environ(pid_t pid, char **ret) {
636 _cleanup_fclose_ FILE *f = NULL;
637 _cleanup_free_ char *outcome = NULL;
638 size_t sz = 0;
639 const char *p;
640 int r;
641
642 assert(pid >= 0);
643 assert(ret);
644
645 p = procfs_file_alloca(pid, "environ");
646
647 r = fopen_unlocked(p, "re", &f);
648 if (r == -ENOENT)
649 return -ESRCH;
650 if (r < 0)
651 return r;
652
653 for (;;) {
654 char c;
655
656 if (sz >= ENVIRONMENT_BLOCK_MAX)
657 return -ENOBUFS;
658
659 if (!GREEDY_REALLOC(outcome, sz + 5))
660 return -ENOMEM;
661
662 r = safe_fgetc(f, &c);
663 if (r < 0)
664 return r;
665 if (r == 0)
666 break;
667
668 if (c == '\0')
669 outcome[sz++] = '\n';
670 else
671 sz += cescape_char(c, outcome + sz);
672 }
673
674 outcome[sz] = '\0';
675 *ret = TAKE_PTR(outcome);
676
677 return 0;
678 }
679
680 int get_process_ppid(pid_t pid, pid_t *ret) {
681 _cleanup_free_ char *line = NULL;
682 unsigned long ppid;
683 const char *p;
684 int r;
685
686 assert(pid >= 0);
687
688 if (pid == 0 || pid == getpid_cached()) {
689 if (ret)
690 *ret = getppid();
691 return 0;
692 }
693
694 if (pid == 1) /* PID 1 has no parent, shortcut this case */
695 return -EADDRNOTAVAIL;
696
697 p = procfs_file_alloca(pid, "stat");
698 r = read_one_line_file(p, &line);
699 if (r == -ENOENT)
700 return -ESRCH;
701 if (r < 0)
702 return r;
703
704 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
705 * value, so let's skip over it manually */
706
707 p = strrchr(line, ')');
708 if (!p)
709 return -EIO;
710
711 p++;
712
713 if (sscanf(p, " "
714 "%*c " /* state */
715 "%lu ", /* ppid */
716 &ppid) != 1)
717 return -EIO;
718
719 /* If ppid is zero the process has no parent. Which might be the case for PID 1 but also for
720 * processes originating in other namespaces that are inserted into a pidns. Return a recognizable
721 * error in this case. */
722 if (ppid == 0)
723 return -EADDRNOTAVAIL;
724
725 if ((pid_t) ppid < 0 || (unsigned long) (pid_t) ppid != ppid)
726 return -ERANGE;
727
728 if (ret)
729 *ret = (pid_t) ppid;
730
731 return 0;
732 }
733
734 int pid_get_start_time(pid_t pid, uint64_t *ret) {
735 _cleanup_free_ char *line = NULL;
736 const char *p;
737 int r;
738
739 assert(pid >= 0);
740
741 p = procfs_file_alloca(pid, "stat");
742 r = read_one_line_file(p, &line);
743 if (r == -ENOENT)
744 return -ESRCH;
745 if (r < 0)
746 return r;
747
748 /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
749 * value, so let's skip over it manually */
750
751 p = strrchr(line, ')');
752 if (!p)
753 return -EIO;
754
755 p++;
756
757 unsigned long llu;
758
759 if (sscanf(p, " "
760 "%*c " /* state */
761 "%*u " /* ppid */
762 "%*u " /* pgrp */
763 "%*u " /* session */
764 "%*u " /* tty_nr */
765 "%*u " /* tpgid */
766 "%*u " /* flags */
767 "%*u " /* minflt */
768 "%*u " /* cminflt */
769 "%*u " /* majflt */
770 "%*u " /* cmajflt */
771 "%*u " /* utime */
772 "%*u " /* stime */
773 "%*u " /* cutime */
774 "%*u " /* cstime */
775 "%*i " /* priority */
776 "%*i " /* nice */
777 "%*u " /* num_threads */
778 "%*u " /* itrealvalue */
779 "%lu ", /* starttime */
780 &llu) != 1)
781 return -EIO;
782
783 if (ret)
784 *ret = llu;
785
786 return 0;
787 }
788
789 int pidref_get_start_time(const PidRef *pid, uint64_t *ret) {
790 uint64_t t;
791 int r;
792
793 if (!pidref_is_set(pid))
794 return -ESRCH;
795
796 r = pid_get_start_time(pid->pid, ret ? &t : NULL);
797 if (r < 0)
798 return r;
799
800 r = pidref_verify(pid);
801 if (r < 0)
802 return r;
803
804 if (ret)
805 *ret = t;
806
807 return 0;
808 }
809
810 int get_process_umask(pid_t pid, mode_t *ret) {
811 _cleanup_free_ char *m = NULL;
812 const char *p;
813 int r;
814
815 assert(pid >= 0);
816 assert(ret);
817
818 p = procfs_file_alloca(pid, "status");
819
820 r = get_proc_field(p, "Umask", WHITESPACE, &m);
821 if (r == -ENOENT)
822 return -ESRCH;
823 if (r < 0)
824 return r;
825
826 return parse_mode(m, ret);
827 }
828
829 int wait_for_terminate(pid_t pid, siginfo_t *status) {
830 siginfo_t dummy;
831
832 assert(pid >= 1);
833
834 if (!status)
835 status = &dummy;
836
837 for (;;) {
838 zero(*status);
839
840 if (waitid(P_PID, pid, status, WEXITED) < 0) {
841
842 if (errno == EINTR)
843 continue;
844
845 return negative_errno();
846 }
847
848 return 0;
849 }
850 }
851
852 /*
853 * Return values:
854 * < 0 : wait_for_terminate() failed to get the state of the
855 * process, the process was terminated by a signal, or
856 * failed for an unknown reason.
857 * >=0 : The process terminated normally, and its exit code is
858 * returned.
859 *
860 * That is, success is indicated by a return value of zero, and an
861 * error is indicated by a non-zero value.
862 *
863 * A warning is emitted if the process terminates abnormally,
864 * and also if it returns non-zero unless check_exit_code is true.
865 */
866 int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) {
867 _cleanup_free_ char *buffer = NULL;
868 siginfo_t status;
869 int r, prio;
870
871 assert(pid > 1);
872
873 if (!name) {
874 r = pid_get_comm(pid, &buffer);
875 if (r < 0)
876 log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pid);
877 else
878 name = buffer;
879 }
880
881 prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
882
883 r = wait_for_terminate(pid, &status);
884 if (r < 0)
885 return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name));
886
887 if (status.si_code == CLD_EXITED) {
888 if (status.si_status != EXIT_SUCCESS)
889 log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
890 "%s failed with exit status %i.", strna(name), status.si_status);
891 else
892 log_debug("%s succeeded.", name);
893
894 return status.si_status;
895
896 } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) {
897
898 log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status));
899 return -EPROTO;
900 }
901
902 log_full(prio, "%s failed due to unknown reason.", strna(name));
903 return -EPROTO;
904 }
905
906 /*
907 * Return values:
908 *
909 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
910 * was terminated by a signal, or failed for an unknown reason.
911 *
912 * >=0 : The process terminated normally with no failures.
913 *
914 * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
915 * states are indicated by error is indicated by a non-zero value.
916 *
917 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
918 * to remain entirely race-free.
919 */
920 int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
921 sigset_t mask;
922 int r;
923 usec_t until;
924
925 assert_se(sigemptyset(&mask) == 0);
926 assert_se(sigaddset(&mask, SIGCHLD) == 0);
927
928 /* Drop into a sigtimewait-based timeout. Waiting for the
929 * pid to exit. */
930 until = usec_add(now(CLOCK_MONOTONIC), timeout);
931 for (;;) {
932 usec_t n;
933 siginfo_t status = {};
934
935 n = now(CLOCK_MONOTONIC);
936 if (n >= until)
937 break;
938
939 r = RET_NERRNO(sigtimedwait(&mask, NULL, TIMESPEC_STORE(until - n)));
940 /* Assuming we woke due to the child exiting. */
941 if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
942 if (status.si_pid == pid) {
943 /* This is the correct child. */
944 if (status.si_code == CLD_EXITED)
945 return status.si_status == 0 ? 0 : -EPROTO;
946 else
947 return -EPROTO;
948 }
949 }
950 /* Not the child, check for errors and proceed appropriately */
951 if (r < 0) {
952 switch (r) {
953 case -EAGAIN:
954 /* Timed out, child is likely hung. */
955 return -ETIMEDOUT;
956 case -EINTR:
957 /* Received a different signal and should retry */
958 continue;
959 default:
960 /* Return any unexpected errors */
961 return r;
962 }
963 }
964 }
965
966 return -EPROTO;
967 }
968
969 void sigkill_wait(pid_t pid) {
970 assert(pid > 1);
971
972 (void) kill(pid, SIGKILL);
973 (void) wait_for_terminate(pid, NULL);
974 }
975
976 void sigkill_waitp(pid_t *pid) {
977 PROTECT_ERRNO;
978
979 if (!pid)
980 return;
981 if (*pid <= 1)
982 return;
983
984 sigkill_wait(*pid);
985 }
986
987 void sigterm_wait(pid_t pid) {
988 assert(pid > 1);
989
990 (void) kill_and_sigcont(pid, SIGTERM);
991 (void) wait_for_terminate(pid, NULL);
992 }
993
/* Sends SIGKILL to the specified process without waiting for it. Any error from kill() is ignored. */
void sigkill_nowait(pid_t pid) {
        assert(pid > 1);

        (void) kill(pid, SIGKILL);
}
999
1000 void sigkill_nowaitp(pid_t *pid) {
1001 PROTECT_ERRNO;
1002
1003 if (!pid)
1004 return;
1005 if (*pid <= 1)
1006 return;
1007
1008 sigkill_nowait(*pid);
1009 }
1010
/* Sends the specified signal to a process and, if that worked, follows up with SIGCONT. Returns the
 * result of the first kill() as 0 or negative errno; the result of the SIGCONT is ignored. */
int kill_and_sigcont(pid_t pid, int sig) {
        int r;

        r = RET_NERRNO(kill(pid, sig));
        if (r < 0)
                return r;

        /* No SIGCONT needed if we just sent one, nor after SIGKILL, which is not affected by the process
         * being suspended anyway. */
        if (!IN_SET(sig, SIGCONT, SIGKILL))
                (void) kill(pid, SIGCONT);

        return r;
}
1023
1024 int getenv_for_pid(pid_t pid, const char *field, char **ret) {
1025 _cleanup_fclose_ FILE *f = NULL;
1026 const char *path;
1027 size_t sum = 0;
1028 int r;
1029
1030 assert(pid >= 0);
1031 assert(field);
1032 assert(ret);
1033
1034 if (pid == 0 || pid == getpid_cached())
1035 return strdup_to_full(ret, getenv(field));
1036
1037 if (!pid_is_valid(pid))
1038 return -EINVAL;
1039
1040 path = procfs_file_alloca(pid, "environ");
1041
1042 r = fopen_unlocked(path, "re", &f);
1043 if (r == -ENOENT)
1044 return -ESRCH;
1045 if (r < 0)
1046 return r;
1047
1048 for (;;) {
1049 _cleanup_free_ char *line = NULL;
1050 const char *match;
1051
1052 if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */
1053 return -ENOBUFS;
1054
1055 r = read_nul_string(f, LONG_LINE_MAX, &line);
1056 if (r < 0)
1057 return r;
1058 if (r == 0) /* EOF */
1059 break;
1060
1061 sum += r;
1062
1063 match = startswith(line, field);
1064 if (match && *match == '=')
1065 return strdup_to_full(ret, match + 1);
1066 }
1067
1068 *ret = NULL;
1069 return 0;
1070 }
1071
/* Checks whether the specified process is a direct child of ours, by comparing its parent PID against
 * our own PID. Returns > 0 if so, 0 if not (always for pid 0 and 1), -ESRCH for negative PIDs, or the
 * negative error reported by get_process_ppid(). */
int pid_is_my_child(pid_t pid) {
        pid_t parent;
        int r;

        if (pid < 0)
                return -ESRCH;

        if (pid <= 1)
                return false;

        r = get_process_ppid(pid, &parent);
        if (r < 0)
                return r;

        return getpid_cached() == parent;
}
1088
1089 int pidref_is_my_child(const PidRef *pid) {
1090 int r, result;
1091
1092 if (!pidref_is_set(pid))
1093 return -ESRCH;
1094
1095 result = pid_is_my_child(pid->pid);
1096 if (result < 0)
1097 return result;
1098
1099 r = pidref_verify(pid);
1100 if (r < 0)
1101 return r;
1102
1103 return result;
1104 }
1105
/* Checks whether a PID is still valid at all, which includes zombies. Returns -ESRCH for negative PIDs,
 * true for pid 0, PID 1 and ourselves, and otherwise probes the process with kill(pid, 0): the answer is
 * false only if that fails with ESRCH. */
int pid_is_unwaited(pid_t pid) {

        if (pid < 0)
                return -ESRCH;

        /* If we or PID 1 were dead and had been waited for, this code would not be running */
        if (pid <= 1 || pid == getpid_cached())
                return true;

        /* Any failure other than ESRCH still means the process exists */
        return kill(pid, 0) >= 0 || errno != ESRCH;
}
1123
1124 int pidref_is_unwaited(const PidRef *pid) {
1125 int r;
1126
1127 if (!pidref_is_set(pid))
1128 return -ESRCH;
1129
1130 if (pid->pid == 1 || pidref_is_self(pid))
1131 return true;
1132
1133 r = pidref_kill(pid, 0);
1134 if (r == -ESRCH)
1135 return false;
1136 if (r < 0)
1137 return r;
1138
1139 return true;
1140 }
1141
/* Checks whether a PID is still valid and does not refer to a zombie. Returns -ESRCH for negative PIDs,
 * true for pid 0, PID 1 and ourselves, false if the process does not exist or its state is 'Z', true for
 * any other state, or a negative errno-style error if the state cannot be determined. */
int pid_is_alive(pid_t pid) {
        int state;

        if (pid < 0)
                return -ESRCH;

        /* If we or PID 1 were a zombie, this code would not be running */
        if (pid <= 1 || pid == getpid_cached())
                return true;

        state = get_process_state(pid);
        if (state == -ESRCH)
                return false;
        if (state < 0)
                return state;

        return state != 'Z';
}
1164
1165 int pidref_is_alive(const PidRef *pidref) {
1166 int r, result;
1167
1168 if (!pidref_is_set(pidref))
1169 return -ESRCH;
1170
1171 result = pid_is_alive(pidref->pid);
1172 if (result < 0) {
1173 assert(result != -ESRCH);
1174 return result;
1175 }
1176
1177 r = pidref_verify(pidref);
1178 if (r == -ESRCH)
1179 return false;
1180 if (r < 0)
1181 return r;
1182
1183 return result;
1184 }
1185
/* Checks whether /proc/<pid>/root refers to the same inode as /proc/1/root. Returns false for negative
 * PIDs, true for pid 0 and ourselves, and the result of inode_same() otherwise. */
int pid_from_same_root_fs(pid_t pid) {
        const char *their_root;

        if (pid < 0)
                return false;

        if (pid == 0 || pid == getpid_cached())
                return true;

        their_root = procfs_file_alloca(pid, "root");

        return inode_same(their_root, "/proc/1/root", 0);
}
1199
1200 bool is_main_thread(void) {
1201 static thread_local int cached = 0;
1202
1203 if (_unlikely_(cached == 0))
1204 cached = getpid_cached() == gettid() ? 1 : -1;
1205
1206 return cached > 0;
1207 }
1208
/* Returns true if the given value lies within the range OOM_SCORE_ADJ_MIN…OOM_SCORE_ADJ_MAX, both ends
 * included. */
bool oom_score_adjust_is_valid(int oa) {
        if (oa < OOM_SCORE_ADJ_MIN)
                return false;

        return oa <= OOM_SCORE_ADJ_MAX;
}
1212
1213 unsigned long personality_from_string(const char *p) {
1214 Architecture architecture;
1215
1216 if (!p)
1217 return PERSONALITY_INVALID;
1218
1219 /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1220 * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1221 * the same register size. */
1222
1223 architecture = architecture_from_string(p);
1224 if (architecture < 0)
1225 return PERSONALITY_INVALID;
1226
1227 if (architecture == native_architecture())
1228 return PER_LINUX;
1229 #ifdef ARCHITECTURE_SECONDARY
1230 if (architecture == ARCHITECTURE_SECONDARY)
1231 return PER_LINUX32;
1232 #endif
1233
1234 return PERSONALITY_INVALID;
1235 }
1236
1237 const char* personality_to_string(unsigned long p) {
1238 Architecture architecture = _ARCHITECTURE_INVALID;
1239
1240 if (p == PER_LINUX)
1241 architecture = native_architecture();
1242 #ifdef ARCHITECTURE_SECONDARY
1243 else if (p == PER_LINUX32)
1244 architecture = ARCHITECTURE_SECONDARY;
1245 #endif
1246
1247 if (architecture < 0)
1248 return NULL;
1249
1250 return architecture_to_string(architecture);
1251 }
1252
/* Wrapper around personality() that papers over the odd way glibc defines it: in some cases a failure is
 * reported via errno, in others as a negative return value carrying an errno-like value. We use errno if
 * it is set and the return value otherwise, and in the latter case also copy the error into errno, so
 * that both always indicate the same issue.
 *
 * Returns the (non-negative) result of personality() on success, a negative errno-style error on failure.
 *
 * See https://github.com/systemd/systemd/issues/6737 */
int safe_personality(unsigned long p) {
        int ret;

        errno = 0;
        ret = personality(p);
        if (ret >= 0)
                return ret;

        /* Failure reported through errno */
        if (errno != 0)
                return -errno;

        /* Failure reported through the return value only, propagate it into errno too */
        errno = -ret;
        return ret;
}
1274
1275 int opinionated_personality(unsigned long *ret) {
1276 int current;
1277
1278 /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1279 * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1280 * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1281
1282 current = safe_personality(PERSONALITY_INVALID);
1283 if (current < 0)
1284 return current;
1285
1286 if (((unsigned long) current & OPINIONATED_PERSONALITY_MASK) == PER_LINUX32)
1287 *ret = PER_LINUX32;
1288 else
1289 *ret = PER_LINUX;
1290
1291 return 0;
1292 }
1293
/* If built with valgrind support and running as PID 1 under valgrind, forks off a helper child that
 * exits immediately, and waits for it. In all other cases this is a no-op.
 * NOTE(review): presumably the helper's exit is what makes valgrind emit its summary — confirm. */
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        pid_t helper;

        if (getpid_cached() != 1 || !(RUNNING_ON_VALGRIND))
                return;

        helper = raw_clone(SIGCHLD);
        if (helper == 0)
                /* Child: nothing to do but leave right away */
                exit(EXIT_SUCCESS);

        if (helper < 0) {
                log_struct_errno(
                                LOG_EMERG, errno,
                                "MESSAGE_ID=" SD_MESSAGE_VALGRIND_HELPER_FORK_STR,
                                LOG_MESSAGE( "Failed to fork off valgrind helper: %m"));
                return;
        }

        log_info("Spawned valgrind helper as PID "PID_FMT".", helper);
        (void) wait_for_terminate(helper, NULL);
#endif
}
1313
/* Comparison function for two PIDs, suitable for use with qsort(). Returns the result of CMP() on the
 * two pointed-to values. */
int pid_compare_func(const pid_t *a, const pid_t *b) {
        pid_t x = *a, y = *b;

        return CMP(x, y);
}
1318
/* State of the PID cache used by getpid_cached(). The variable holds one of:
 *
 *     CACHED_PID_UNSET (0)  → the cache has not been initialized yet
 *     CACHED_PID_BUSY (-1)  → some thread is initializing the cache right now
 *     anything else         → the cached PID
 */

#define CACHED_PID_UNSET ((pid_t) 0)
#define CACHED_PID_BUSY ((pid_t) -1)

static pid_t cached_pid = CACHED_PID_UNSET;

/* Drops the cached PID again. This is invoked in the child after a fork(), i.e. at the first moment the
 * PID has changed. */
void reset_cached_pid(void) {
        cached_pid = CACHED_PID_UNSET;
}
1335
1336 pid_t getpid_cached(void) {
1337 static bool installed = false;
1338 pid_t current_value = CACHED_PID_UNSET;
1339
1340 /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1341 * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1342 * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1343 * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1344 *
1345 * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1346 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1347 */
1348
1349 (void) __atomic_compare_exchange_n(
1350 &cached_pid,
1351 &current_value,
1352 CACHED_PID_BUSY,
1353 false,
1354 __ATOMIC_SEQ_CST,
1355 __ATOMIC_SEQ_CST);
1356
1357 switch (current_value) {
1358
1359 case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
1360 pid_t new_pid;
1361
1362 new_pid = raw_getpid();
1363
1364 if (!installed) {
1365 /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1366 * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1367 * we'll check for errors only in the most generic fashion possible. */
1368
1369 if (pthread_atfork(NULL, NULL, reset_cached_pid) != 0) {
1370 /* OOM? Let's try again later */
1371 cached_pid = CACHED_PID_UNSET;
1372 return new_pid;
1373 }
1374
1375 installed = true;
1376 }
1377
1378 cached_pid = new_pid;
1379 return new_pid;
1380 }
1381
1382 case CACHED_PID_BUSY: /* Somebody else is currently initializing */
1383 return raw_getpid();
1384
1385 default: /* Properly initialized */
1386 return current_value;
1387 }
1388 }
1389
/* Returns 0 if the effective UID is 0. Otherwise logs an error and returns what log_error_errno() yields
 * for a synthetic EPERM. */
int must_be_root(void) {
        if (geteuid() != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root.");

        return 0;
}
1397
1398 static void restore_sigsetp(sigset_t **ssp) {
1399 if (*ssp)
1400 (void) sigprocmask(SIG_SETMASK, *ssp, NULL);
1401 }
1402
1403 pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
1404 size_t ps;
1405 pid_t pid;
1406 void *mystack;
1407
1408 /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1409 * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1410 *
1411 * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1412 * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1413 * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1414 * strictly single-threaded or your child never calls malloc(). */
1415
1416 assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
1417 CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
1418
1419 /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1420 * the net effect is that the child will have the start of its stack inside the stack of the parent,
1421 * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1422 * since we don't want to deal with differences between systems where the stack grows backwards or
1423 * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1424 * want it page aligned, hence we'll allocate one page more. Makes 3. */
1425
1426 ps = page_size();
1427 mystack = alloca(ps*3);
1428 mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
1429 mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
1430
1431 #if HAVE_CLONE
1432 pid = clone(fn, mystack, flags, userdata);
1433 #else
1434 pid = __clone2(fn, mystack, ps, flags, userdata);
1435 #endif
1436 if (pid < 0)
1437 return -errno;
1438
1439 return pid;
1440 }
1441
1442 static int fork_flags_to_signal(ForkFlags flags) {
1443 return (flags & FORK_DEATHSIG_SIGTERM) ? SIGTERM :
1444 (flags & FORK_DEATHSIG_SIGINT) ? SIGINT :
1445 SIGKILL;
1446 }
1447
1448 int safe_fork_full(
1449 const char *name,
1450 const int stdio_fds[3],
1451 int except_fds[],
1452 size_t n_except_fds,
1453 ForkFlags flags,
1454 pid_t *ret_pid) {
1455
1456 pid_t original_pid, pid;
1457 sigset_t saved_ss, ss;
1458 _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
1459 bool block_signals = false, block_all = false, intermediary = false;
1460 int prio, r;
1461
1462 assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
1463 assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT));
1464
1465 /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
1466 * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
1467
1468 prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;
1469
1470 original_pid = getpid_cached();
1471
1472 if (flags & FORK_FLUSH_STDIO) {
1473 fflush(stdout);
1474 fflush(stderr); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
1475 }
1476
1477 if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) {
1478 /* We temporarily block all signals, so that the new child has them blocked initially. This
1479 * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
1480 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1481
1482 assert_se(sigfillset(&ss) >= 0);
1483 block_signals = block_all = true;
1484
1485 } else if (flags & FORK_WAIT) {
1486 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1487
1488 assert_se(sigemptyset(&ss) >= 0);
1489 assert_se(sigaddset(&ss, SIGCHLD) >= 0);
1490 block_signals = true;
1491 }
1492
1493 if (block_signals) {
1494 if (sigprocmask(SIG_SETMASK, &ss, &saved_ss) < 0)
1495 return log_full_errno(prio, errno, "Failed to set signal mask: %m");
1496 saved_ssp = &saved_ss;
1497 }
1498
1499 if (FLAGS_SET(flags, FORK_DETACH)) {
1500 assert(!FLAGS_SET(flags, FORK_WAIT));
1501 assert(!ret_pid);
1502
1503 /* Fork off intermediary child if needed */
1504
1505 r = is_reaper_process();
1506 if (r < 0)
1507 return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
1508
1509 if (!r) {
1510 /* Not a reaper process, hence do a double fork() so we are reparented to one */
1511
1512 pid = fork();
1513 if (pid < 0)
1514 return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
1515 if (pid > 0) {
1516 log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
1517 return 1; /* return in the parent */
1518 }
1519
1520 intermediary = true;
1521 }
1522 }
1523
1524 if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0)
1525 pid = raw_clone(SIGCHLD|
1526 (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
1527 (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
1528 (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0));
1529 else
1530 pid = fork();
1531 if (pid < 0)
1532 return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
1533 if (pid > 0) {
1534
1535 /* If we are in the intermediary process, exit now */
1536 if (intermediary)
1537 _exit(EXIT_SUCCESS);
1538
1539 /* We are in the parent process */
1540 log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
1541
1542 if (flags & FORK_WAIT) {
1543 if (block_all) {
1544 /* undo everything except SIGCHLD */
1545 ss = saved_ss;
1546 assert_se(sigaddset(&ss, SIGCHLD) >= 0);
1547 (void) sigprocmask(SIG_SETMASK, &ss, NULL);
1548 }
1549
1550 r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? WAIT_LOG : 0));
1551 if (r < 0)
1552 return r;
1553 if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
1554 return -EPROTO;
1555 }
1556
1557 if (ret_pid)
1558 *ret_pid = pid;
1559
1560 return 1;
1561 }
1562
1563 /* We are in the child process */
1564
1565 /* Restore signal mask manually */
1566 saved_ssp = NULL;
1567
1568 if (flags & FORK_REOPEN_LOG) {
1569 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1570 log_close();
1571 log_set_open_when_needed(true);
1572 log_settle_target();
1573 }
1574
1575 if (name) {
1576 r = rename_process(name);
1577 if (r < 0)
1578 log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
1579 r, "Failed to rename process, ignoring: %m");
1580 }
1581
1582 if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL))
1583 if (prctl(PR_SET_PDEATHSIG, fork_flags_to_signal(flags)) < 0) {
1584 log_full_errno(prio, errno, "Failed to set death signal: %m");
1585 _exit(EXIT_FAILURE);
1586 }
1587
1588 if (flags & FORK_RESET_SIGNALS) {
1589 r = reset_all_signal_handlers();
1590 if (r < 0) {
1591 log_full_errno(prio, r, "Failed to reset signal handlers: %m");
1592 _exit(EXIT_FAILURE);
1593 }
1594
1595 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1596 r = reset_signal_mask();
1597 if (r < 0) {
1598 log_full_errno(prio, r, "Failed to reset signal mask: %m");
1599 _exit(EXIT_FAILURE);
1600 }
1601 } else if (block_signals) { /* undo what we did above */
1602 if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
1603 log_full_errno(prio, errno, "Failed to restore signal mask: %m");
1604 _exit(EXIT_FAILURE);
1605 }
1606 }
1607
1608 if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGINT)) {
1609 pid_t ppid;
1610 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1611 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1612
1613 ppid = getppid();
1614 if (ppid == 0)
1615 /* Parent is in a different PID namespace. */;
1616 else if (ppid != original_pid) {
1617 int sig = fork_flags_to_signal(flags);
1618 log_debug("Parent died early, raising %s.", signal_to_string(sig));
1619 (void) raise(sig);
1620 _exit(EXIT_FAILURE);
1621 }
1622 }
1623
1624 if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
1625 /* Optionally, make sure we never propagate mounts to the host. */
1626 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1627 log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
1628 _exit(EXIT_FAILURE);
1629 }
1630 }
1631
1632 if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
1633 assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));
1634
1635 /* Optionally, overmount new tmpfs instance on /tmp/. */
1636 r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
1637 MS_NOSUID|MS_NODEV,
1638 "mode=01777" TMPFS_LIMITS_RUN);
1639 if (r < 0) {
1640 log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
1641 _exit(EXIT_FAILURE);
1642 }
1643 }
1644
1645 if (flags & FORK_REARRANGE_STDIO) {
1646 if (stdio_fds) {
1647 r = rearrange_stdio(stdio_fds[0], stdio_fds[1], stdio_fds[2]);
1648 if (r < 0) {
1649 log_full_errno(prio, r, "Failed to rearrange stdio fds: %m");
1650 _exit(EXIT_FAILURE);
1651 }
1652
1653 /* Turn off O_NONBLOCK on the fdio fds, in case it was left on */
1654 stdio_disable_nonblock();
1655 } else {
1656 r = make_null_stdio();
1657 if (r < 0) {
1658 log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
1659 _exit(EXIT_FAILURE);
1660 }
1661 }
1662 } else if (flags & FORK_STDOUT_TO_STDERR) {
1663 if (dup2(STDERR_FILENO, STDOUT_FILENO) < 0) {
1664 log_full_errno(prio, errno, "Failed to connect stdout to stderr: %m");
1665 _exit(EXIT_FAILURE);
1666 }
1667 }
1668
1669 if (flags & FORK_CLOSE_ALL_FDS) {
1670 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1671 log_close();
1672
1673 r = close_all_fds(except_fds, n_except_fds);
1674 if (r < 0) {
1675 log_full_errno(prio, r, "Failed to close all file descriptors: %m");
1676 _exit(EXIT_FAILURE);
1677 }
1678 }
1679
1680 if (flags & FORK_PACK_FDS) {
1681 /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1682 * open, this is including the log. This is required by pack_fds, which will
1683 * get stuck in an infinite loop of any FDs other than except_fds are open. */
1684 assert(FLAGS_SET(flags, FORK_CLOSE_ALL_FDS));
1685
1686 r = pack_fds(except_fds, n_except_fds);
1687 if (r < 0) {
1688 log_full_errno(prio, r, "Failed to pack file descriptors: %m");
1689 _exit(EXIT_FAILURE);
1690 }
1691 }
1692
1693 if (flags & FORK_CLOEXEC_OFF) {
1694 r = fd_cloexec_many(except_fds, n_except_fds, false);
1695 if (r < 0) {
1696 log_full_errno(prio, r, "Failed to turn off O_CLOEXEC on file descriptors: %m");
1697 _exit(EXIT_FAILURE);
1698 }
1699 }
1700
1701 /* When we were asked to reopen the logs, do so again now */
1702 if (flags & FORK_REOPEN_LOG) {
1703 log_open();
1704 log_set_open_when_needed(false);
1705 }
1706
1707 if (flags & FORK_RLIMIT_NOFILE_SAFE) {
1708 r = rlimit_nofile_safe();
1709 if (r < 0) {
1710 log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1711 _exit(EXIT_FAILURE);
1712 }
1713 }
1714
1715 if (!FLAGS_SET(flags, FORK_KEEP_NOTIFY_SOCKET)) {
1716 r = RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
1717 if (r < 0) {
1718 log_full_errno(prio, r, "Failed to unset $NOTIFY_SOCKET: %m");
1719 _exit(EXIT_FAILURE);
1720 }
1721 }
1722
1723 if (ret_pid)
1724 *ret_pid = getpid_cached();
1725
1726 return 0;
1727 }
1728
1729 int pidref_safe_fork_full(
1730 const char *name,
1731 const int stdio_fds[3],
1732 int except_fds[],
1733 size_t n_except_fds,
1734 ForkFlags flags,
1735 PidRef *ret_pid) {
1736
1737 pid_t pid;
1738 int r, q;
1739
1740 assert(!FLAGS_SET(flags, FORK_WAIT));
1741
1742 r = safe_fork_full(name, stdio_fds, except_fds, n_except_fds, flags, &pid);
1743 if (r < 0)
1744 return r;
1745
1746 q = pidref_set_pid(ret_pid, pid);
1747 if (q < 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1748 *ret_pid = PIDREF_MAKE_FROM_PID(pid);
1749
1750 return r;
1751 }
1752
1753 int namespace_fork(
1754 const char *outer_name,
1755 const char *inner_name,
1756 int except_fds[],
1757 size_t n_except_fds,
1758 ForkFlags flags,
1759 int pidns_fd,
1760 int mntns_fd,
1761 int netns_fd,
1762 int userns_fd,
1763 int root_fd,
1764 pid_t *ret_pid) {
1765
1766 int r;
1767
1768 /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1769 * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1770 * /proc/self/fd works correctly. */
1771
1772 r = safe_fork_full(outer_name,
1773 NULL,
1774 except_fds, n_except_fds,
1775 (flags|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
1776 if (r < 0)
1777 return r;
1778 if (r == 0) {
1779 pid_t pid;
1780
1781 /* Child */
1782
1783 r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd);
1784 if (r < 0) {
1785 log_full_errno(FLAGS_SET(flags, FORK_LOG) ? LOG_ERR : LOG_DEBUG, r, "Failed to join namespace: %m");
1786 _exit(EXIT_FAILURE);
1787 }
1788
1789 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1790 r = safe_fork_full(inner_name,
1791 NULL,
1792 except_fds, n_except_fds,
1793 flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO), &pid);
1794 if (r < 0)
1795 _exit(EXIT_FAILURE);
1796 if (r == 0) {
1797 /* Child */
1798 if (ret_pid)
1799 *ret_pid = pid;
1800 return 0;
1801 }
1802
1803 r = wait_for_terminate_and_check(inner_name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1804 if (r < 0)
1805 _exit(EXIT_FAILURE);
1806
1807 _exit(r);
1808 }
1809
1810 return 1;
1811 }
1812
1813 int set_oom_score_adjust(int value) {
1814 char t[DECIMAL_STR_MAX(int)];
1815
1816 xsprintf(t, "%i", value);
1817
1818 return write_string_file("/proc/self/oom_score_adj", t,
1819 WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
1820 }
1821
1822 int get_oom_score_adjust(int *ret) {
1823 _cleanup_free_ char *t = NULL;
1824 int r, a;
1825
1826 r = read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX, &t, NULL);
1827 if (r < 0)
1828 return r;
1829
1830 delete_trailing_chars(t, WHITESPACE);
1831
1832 assert_se(safe_atoi(t, &a) >= 0);
1833 assert_se(oom_score_adjust_is_valid(a));
1834
1835 if (ret)
1836 *ret = a;
1837 return 0;
1838 }
1839
1840 int pidfd_get_pid(int fd, pid_t *ret) {
1841 char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
1842 _cleanup_free_ char *fdinfo = NULL;
1843 char *p;
1844 int r;
1845
1846 /* Converts a pidfd into a pid. Well known errors:
1847 *
1848 * -EBADF → fd invalid
1849 * -ENOSYS → /proc/ not mounted
1850 * -ENOTTY → fd valid, but not a pidfd
1851 * -EREMOTE → fd valid, but pid is in another namespace we cannot translate to the local one
1852 * -ESRCH → fd valid, but process is already reaped
1853 */
1854
1855 if (fd < 0)
1856 return -EBADF;
1857
1858 xsprintf(path, "/proc/self/fdinfo/%i", fd);
1859
1860 r = read_full_virtual_file(path, &fdinfo, NULL);
1861 if (r == -ENOENT) /* if fdinfo doesn't exist we assume the process does not exist */
1862 return proc_mounted() > 0 ? -EBADF : -ENOSYS;
1863 if (r < 0)
1864 return r;
1865
1866 p = find_line_startswith(fdinfo, "Pid:");
1867 if (!p)
1868 return -ENOTTY; /* not a pidfd? */
1869
1870 p += strspn(p, WHITESPACE);
1871 p[strcspn(p, WHITESPACE)] = 0;
1872
1873 if (streq(p, "0"))
1874 return -EREMOTE; /* PID is in foreign PID namespace? */
1875 if (streq(p, "-1"))
1876 return -ESRCH; /* refers to reaped process? */
1877
1878 return parse_pid(p, ret);
1879 }
1880
/* Checks whether 'pidfd' refers to the process with PID 'pid'. Returns 0 if it does, -ESRCH if it refers
 * to a different PID, and otherwise the error pidfd_get_pid() returned. */
int pidfd_verify_pid(int pidfd, pid_t pid) {
        pid_t actual;
        int r;

        assert(pidfd >= 0);
        assert(pid > 0);

        r = pidfd_get_pid(pidfd, &actual);
        if (r < 0)
                return r;

        if (actual == pid)
                return 0;

        return -ESRCH;
}
1894
/* Translates an RLIMIT_NICE resource limit value into the lowest nice level it permits: PRIO_MAX minus
 * the limit, clamped to the range PRIO_MIN … PRIO_MAX-1. */
static int rlimit_to_nice(rlim_t limit) {
        const rlim_t ceiling = -PRIO_MIN + PRIO_MAX;

        if (limit >= ceiling)
                return PRIO_MIN; /* i.e. -20 */

        if (limit > 1)
                return PRIO_MAX - (int) limit;

        return PRIO_MAX-1; /* i.e. 19 */
}
1904
/* Tries to set the nice level of the calling process to 'priority'. If that is refused for lack of
 * privileges, falls back to the level that RLIMIT_NICE permits.
 *
 * Returns 1 if the requested level is in effect, 0 if it is not (either the fallback level was applied, or
 * nothing could be changed), and a negative errno-style error otherwise. */
int setpriority_closest(int priority) {
        struct rlimit nice_limit;
        int denied, now, best;

        /* First, simply try to set what was asked for */
        if (setpriority(PRIO_PROCESS, 0, priority) >= 0)
                return 1;

        /* Anything but a permission problem is propagated as is */
        denied = -errno;
        if (!ERRNO_IS_PRIVILEGE(denied))
                return denied;

        /* getpriority() may legitimately return negative values, hence errors are detected via errno */
        errno = 0;
        now = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        /* Already at the requested level: nothing to do. If we were asked to raise the nice level from our
         * status quo we'd expect that to always work. If it doesn't, then the whole setpriority() system
         * call is blocked to us, hence let's propagate the error right-away. */
        if (priority >= now)
                return priority == now ? 1 : denied;

        if (getrlimit(RLIMIT_NICE, &nice_limit) < 0)
                return -errno;

        best = rlimit_to_nice(nice_limit.rlim_cur);

        /* We are already less nice than limit allows us */
        if (now < best) {
                log_debug("Cannot raise nice level, permissions and the resource limit do not allow it.");
                return 0;
        }

        /* Push to the allowed limit */
        if (setpriority(PRIO_PROCESS, 0, best) < 0)
                return -errno;

        log_debug("Cannot set requested nice level (%i), used next best (%i).", priority, best);
        return 0;
}
1950
1951 _noreturn_ void freeze(void) {
1952 log_close();
1953
1954 /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
1955 * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
1956 * to be compatible with being called from signal handlers. */
1957 (void) close_all_fds_without_malloc(NULL, 0);
1958
1959 /* Let's not freeze right away, but keep reaping zombies. */
1960 for (;;) {
1961 siginfo_t si = {};
1962
1963 if (waitid(P_ALL, 0, &si, WEXITED) < 0 && errno != EINTR)
1964 break;
1965 }
1966
1967 /* waitid() failed with an unexpected error, things are really borked. Freeze now! */
1968 for (;;)
1969 pause();
1970 }
1971
1972 int get_process_threads(pid_t pid) {
1973 _cleanup_free_ char *t = NULL;
1974 const char *p;
1975 int n, r;
1976
1977 if (pid < 0)
1978 return -EINVAL;
1979
1980 p = procfs_file_alloca(pid, "status");
1981
1982 r = get_proc_field(p, "Threads", WHITESPACE, &t);
1983 if (r == -ENOENT)
1984 return proc_mounted() == 0 ? -ENOSYS : -ESRCH;
1985 if (r < 0)
1986 return r;
1987
1988 r = safe_atoi(t, &n);
1989 if (r < 0)
1990 return r;
1991 if (n < 0)
1992 return -EINVAL;
1993
1994 return n;
1995 }
1996
/* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes reparented
 * to us: that's the case if we are PID 1, or if PR_SET_CHILD_SUBREAPER has been turned on for us.
 *
 * Returns > 0 if so, 0 if not, and a negative errno-style error if the prctl() query fails. */
int is_reaper_process(void) {
        int subreaper = 0;

        if (getpid_cached() == 1)
                return true;

        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &subreaper, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return !!subreaper;
}
2011
/* Turns the child subreaper flag of the calling process on or off, according to 'b'. For PID 1 no system
 * call is made: enabling is a no-op that returns 0, while disabling is refused with -EINVAL.
 *
 * Returns 0 on success, negative errno-style error on failure. */
int make_reaper_process(bool b) {

        if (getpid_cached() == 1)
                return b ? 0 : -EINVAL;

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2029
2030 DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
2031
2032 int posix_spawn_wrapper(
2033 const char *path,
2034 char * const *argv,
2035 char * const *envp,
2036 const char *cgroup,
2037 PidRef *ret_pidref) {
2038
2039 short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF;
2040 posix_spawnattr_t attr;
2041 sigset_t mask;
2042 int r;
2043
2044 /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2045 * caller will be blocked until the child either exits or exec's. The memory of the child will be
2046 * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2047 * issues.
2048 *
2049 * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2050 * if available. Note that CLONE_INTO_CGROUP is only supported on cgroup v2.
2051 * returns 1: We're already in the right cgroup
2052 * 0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2053 * needs to call 'cg_attach' on their own */
2054
2055 assert(path);
2056 assert(argv);
2057 assert(ret_pidref);
2058
2059 assert_se(sigfillset(&mask) >= 0);
2060
2061 r = posix_spawnattr_init(&attr);
2062 if (r != 0)
2063 return -r; /* These functions return a positive errno on failure */
2064
2065 /* Initialization needs to succeed before we can set up a destructor. */
2066 _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
2067
2068 #if HAVE_PIDFD_SPAWN
2069 _cleanup_close_ int cgroup_fd = -EBADF;
2070
2071 if (cgroup) {
2072 _cleanup_free_ char *resolved_cgroup = NULL;
2073
2074 r = cg_get_path_and_check(
2075 SYSTEMD_CGROUP_CONTROLLER,
2076 cgroup,
2077 /* suffix= */ NULL,
2078 &resolved_cgroup);
2079 if (r < 0)
2080 return r;
2081
2082 cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
2083 if (cgroup_fd < 0)
2084 return -errno;
2085
2086 r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
2087 if (r != 0)
2088 return -r;
2089
2090 flags |= POSIX_SPAWN_SETCGROUP;
2091 }
2092 #endif
2093
2094 r = posix_spawnattr_setflags(&attr, flags);
2095 if (r != 0)
2096 return -r;
2097 r = posix_spawnattr_setsigmask(&attr, &mask);
2098 if (r != 0)
2099 return -r;
2100
2101 #if HAVE_PIDFD_SPAWN
2102 _cleanup_close_ int pidfd = -EBADF;
2103
2104 r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
2105 if (r == 0) {
2106 r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
2107 if (r < 0)
2108 return r;
2109
2110 return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
2111 }
2112 if (ERRNO_IS_NOT_SUPPORTED(r)) {
2113 /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode. */
2114 if (cgroup && cg_is_threaded(cgroup) > 0)
2115 return -EUCLEAN;
2116
2117 /* clone3() not available? */
2118 } else if (!ERRNO_IS_PRIVILEGE(r))
2119 return -r;
2120
2121 /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
2122 * flags to remove the cgroup one, which is what redirects to clone3() */
2123 flags &= ~POSIX_SPAWN_SETCGROUP;
2124 r = posix_spawnattr_setflags(&attr, flags);
2125 if (r != 0)
2126 return -r;
2127 #endif
2128
2129 pid_t pid;
2130 r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
2131 if (r != 0)
2132 return -r;
2133
2134 r = pidref_set_pid(ret_pidref, pid);
2135 if (r < 0)
2136 return r;
2137
2138 return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
2139 }
2140
/* Opens /proc as a directory stream. On success stores the stream in *ret and returns 0, on failure
 * returns a negative errno-style error and leaves *ret untouched. */
int proc_dir_open(DIR **ret) {
        DIR *proc;

        assert(ret);

        proc = opendir("/proc");
        if (proc) {
                *ret = proc;
                return 0;
        }

        return -errno;
}
2153
2154 int proc_dir_read(DIR *d, pid_t *ret) {
2155 assert(d);
2156
2157 for (;;) {
2158 struct dirent *de;
2159
2160 errno = 0;
2161 de = readdir_no_dot(d);
2162 if (!de) {
2163 if (errno != 0)
2164 return -errno;
2165
2166 break;
2167 }
2168
2169 if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN))
2170 continue;
2171
2172 if (parse_pid(de->d_name, ret) >= 0)
2173 return 1;
2174 }
2175
2176 if (ret)
2177 *ret = 0;
2178 return 0;
2179 }
2180
2181 int proc_dir_read_pidref(DIR *d, PidRef *ret) {
2182 int r;
2183
2184 assert(d);
2185
2186 for (;;) {
2187 pid_t pid;
2188
2189 r = proc_dir_read(d, &pid);
2190 if (r < 0)
2191 return r;
2192 if (r == 0)
2193 break;
2194
2195 r = pidref_set_pid(ret, pid);
2196 if (r == -ESRCH) /* gone by now? skip it */
2197 continue;
2198 if (r < 0)
2199 return r;
2200
2201 return 1;
2202 }
2203
2204 if (ret)
2205 *ret = PIDREF_NULL;
2206 return 0;
2207 }
2208
2209 static const char *const sigchld_code_table[] = {
2210 [CLD_EXITED] = "exited",
2211 [CLD_KILLED] = "killed",
2212 [CLD_DUMPED] = "dumped",
2213 [CLD_TRAPPED] = "trapped",
2214 [CLD_STOPPED] = "stopped",
2215 [CLD_CONTINUED] = "continued",
2216 };
2217
2218 DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
2219
2220 static const char* const sched_policy_table[] = {
2221 [SCHED_OTHER] = "other",
2222 [SCHED_BATCH] = "batch",
2223 [SCHED_IDLE] = "idle",
2224 [SCHED_FIFO] = "fifo",
2225 [SCHED_RR] = "rr",
2226 };
2227
2228 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);