src/basic/process-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <linux/oom.h>
   4 #include <pthread.h>
   5 #include <spawn.h>
   6 #include <stdio.h>
   7 #include <sys/mount.h>
   8 #include <sys/personality.h>
   9 #include <sys/prctl.h>
  10 #include <sys/wait.h>
  11 #include <syslog.h>
  12 #include <threads.h>
  13 #include <unistd.h>
  14 #if HAVE_VALGRIND_VALGRIND_H
  15 #include <valgrind/valgrind.h>
  16 #endif
  17
  18 #include "sd-messages.h"
  19
  20 #include "alloc-util.h"
  21 #include "architecture.h"
  22 #include "argv-util.h"
  23 #include "cgroup-util.h"
  24 #include "dirent-util.h"
  25 #include "env-file.h"
  26 #include "errno-util.h"
  27 #include "escape.h"
  28 #include "fd-util.h"
  29 #include "fileio.h"
  30 #include "fs-util.h"
  31 #include "hostname-util.h"
  32 #include "io-util.h"
  33 #include "iovec-util.h"
  34 #include "locale-util.h"
  35 #include "log.h"
  36 #include "memory-util.h"
  37 #include "missing_syscall.h"
  38 #include "mountpoint-util.h"
  39 #include "namespace-util.h"
  40 #include "nulstr-util.h"
  41 #include "parse-util.h"
  42 #include "path-util.h"
  43 #include "pidfd-util.h"
  44 #include "pidref.h"
  45 #include "process-util.h"
  46 #include "raw-clone.h"
  47 #include "rlimit-util.h"
  48 #include "signal-util.h"
  49 #include "socket-util.h"
  50 #include "stat-util.h"
  51 #include "stdio-util.h"
  52 #include "string-table.h"
  53 #include "string-util.h"
  54 #include "time-util.h"
  55 #include "user-util.h"
  56
/* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
 * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
 * pid_get_comm() uses this as the size of the buffer the escaped process name is written into.
 */
#define COMM_MAX_LEN 128
  61
  62 static int get_process_state(pid_t pid) {
  63         _cleanup_free_ char *line = NULL;
  64         const char *p;
  65         char state;
  66         int r;
  67
  68         assert(pid >= 0);
  69
  70         /* Shortcut: if we are enquired about our own state, we are obviously running */
  71         if (pid == 0 || pid == getpid_cached())
  72                 return (unsigned char) 'R';
  73
  74         p = procfs_file_alloca(pid, "stat");
  75
  76         r = read_one_line_file(p, &line);
  77         if (r == -ENOENT)
  78                 return -ESRCH;
  79         if (r < 0)
  80                 return r;
  81
  82         p = strrchr(line, ')');
  83         if (!p)
  84                 return -EIO;
  85
  86         p++;
  87
  88         if (sscanf(p, " %c", &state) != 1)
  89                 return -EIO;
  90
  91         return (unsigned char) state;
  92 }
  93
  94 int pid_get_comm(pid_t pid, char **ret) {
  95         _cleanup_free_ char *escaped = NULL, *comm = NULL;
  96         int r;
  97
  98         assert(pid >= 0);
  99         assert(ret);
 100
 101         if (pid == 0 || pid == getpid_cached()) {
 102                 comm = new0(char, TASK_COMM_LEN + 1); /* Must fit in 16 byte according to prctl(2) */
 103                 if (!comm)
 104                         return -ENOMEM;
 105
 106                 if (prctl(PR_GET_NAME, comm) < 0)
 107                         return -errno;
 108         } else {
 109                 const char *p;
 110
 111                 p = procfs_file_alloca(pid, "comm");
 112
 113                 /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
 114                 r = read_one_line_file(p, &comm);
 115                 if (r == -ENOENT)
 116                         return -ESRCH;
 117                 if (r < 0)
 118                         return r;
 119         }
 120
 121         escaped = new(char, COMM_MAX_LEN);
 122         if (!escaped)
 123                 return -ENOMEM;
 124
 125         /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
 126         cellescape(escaped, COMM_MAX_LEN, comm);
 127
 128         *ret = TAKE_PTR(escaped);
 129         return 0;
 130 }
 131
 132 int pidref_get_comm(const PidRef *pid, char **ret) {
 133         _cleanup_free_ char *comm = NULL;
 134         int r;
 135
 136         if (!pidref_is_set(pid))
 137                 return -ESRCH;
 138
 139         if (pidref_is_remote(pid))
 140                 return -EREMOTE;
 141
 142         r = pid_get_comm(pid->pid, &comm);
 143         if (r < 0)
 144                 return r;
 145
 146         r = pidref_verify(pid);
 147         if (r < 0)
 148                 return r;
 149
 150         if (ret)
 151                 *ret = TAKE_PTR(comm);
 152         return 0;
 153 }
 154
 155 static int pid_get_cmdline_nulstr(
 156                 pid_t pid,
 157                 size_t max_size,
 158                 ProcessCmdlineFlags flags,
 159                 char **ret,
 160                 size_t *ret_size) {
 161
 162         _cleanup_free_ char *t = NULL;
 163         const char *p;
 164         size_t k;
 165         int r;
 166
 167         /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
 168          * with a specified size.
 169          *
 170          * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
 171          * (the case for kernel threads), or has a command line that resolves to the empty string, will
 172          * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
 173          * data.
 174          *
 175          * Returns an error, 0 if output was read but is truncated, 1 otherwise.
 176          */
 177
 178         p = procfs_file_alloca(pid, "cmdline");
 179         r = read_virtual_file(p, max_size, &t, &k); /* Let's assume that each input byte results in >= 1
 180                                                      * columns of output. We ignore zero-width codepoints. */
 181         if (r == -ENOENT)
 182                 return -ESRCH;
 183         if (r < 0)
 184                 return r;
 185
 186         if (k == 0) {
 187                 if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK))
 188                         return -ENOENT;
 189
 190                 /* Kernel threads have no argv[] */
 191                 _cleanup_free_ char *comm = NULL;
 192
 193                 r = pid_get_comm(pid, &comm);
 194                 if (r < 0)
 195                         return r;
 196
 197                 free(t);
 198                 t = strjoin("[", comm, "]");
 199                 if (!t)
 200                         return -ENOMEM;
 201
 202                 k = strlen(t);
 203                 r = k <= max_size;
 204                 if (r == 0) /* truncation */
 205                         t[max_size] = '\0';
 206         }
 207
 208         if (ret)
 209                 *ret = TAKE_PTR(t);
 210         if (ret_size)
 211                 *ret_size = k;
 212
 213         return r;
 214 }
 215
 216 int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
 217         _cleanup_free_ char *t = NULL;
 218         size_t k;
 219         char *ans;
 220
 221         assert(pid >= 0);
 222         assert(ret);
 223
 224         /* Retrieve and format a command line. See above for discussion of retrieval options.
 225          *
 226          * There are two main formatting modes:
 227          *
 228          * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special
 229          *   characters are present, this output can be copy-pasted into the terminal to execute. UTF-8
 230          *   output is assumed.
 231          *
 232          * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by �. The
 233          *   returned string is of the specified console width at most, abbreviated with an ellipsis.
 234          *
 235          * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
 236          * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *line otherwise. */
 237
 238         int full = pid_get_cmdline_nulstr(pid, max_columns, flags, &t, &k);
 239         if (full < 0)
 240                 return full;
 241
 242         if (flags & (PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_QUOTE_POSIX)) {
 243                 ShellEscapeFlags shflags = SHELL_ESCAPE_EMPTY |
 244                         FLAGS_SET(flags, PROCESS_CMDLINE_QUOTE_POSIX) * SHELL_ESCAPE_POSIX;
 245
 246                 assert(!(flags & PROCESS_CMDLINE_USE_LOCALE));
 247
 248                 _cleanup_strv_free_ char **args = NULL;
 249
 250                 /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
 251                  * See also issue #21186. */
 252                 args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
 253                 if (!args)
 254                         return -ENOMEM;
 255
 256                 ans = quote_command_line(args, shflags);
 257                 if (!ans)
 258                         return -ENOMEM;
 259         } else {
 260                 /* Arguments are separated by NULs. Let's replace those with spaces. */
 261                 for (size_t i = 0; i < k - 1; i++)
 262                         if (t[i] == '\0')
 263                                 t[i] = ' ';
 264
 265                 delete_trailing_chars(t, WHITESPACE);
 266
 267                 bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8();
 268
 269                 ans = escape_non_printable_full(t, max_columns,
 270                                                 eight_bit * XESCAPE_8_BIT | !full * XESCAPE_FORCE_ELLIPSIS);
 271                 if (!ans)
 272                         return -ENOMEM;
 273
 274                 ans = str_realloc(ans);
 275         }
 276
 277         *ret = ans;
 278         return 0;
 279 }
 280
 281 int pidref_get_cmdline(const PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
 282         _cleanup_free_ char *s = NULL;
 283         int r;
 284
 285         if (!pidref_is_set(pid))
 286                 return -ESRCH;
 287
 288         if (pidref_is_remote(pid))
 289                 return -EREMOTE;
 290
 291         r = pid_get_cmdline(pid->pid, max_columns, flags, &s);
 292         if (r < 0)
 293                 return r;
 294
 295         r = pidref_verify(pid);
 296         if (r < 0)
 297                 return r;
 298
 299         if (ret)
 300                 *ret = TAKE_PTR(s);
 301         return 0;
 302 }
 303
 304 int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret) {
 305         _cleanup_free_ char *t = NULL;
 306         char **args;
 307         size_t k;
 308         int r;
 309
 310         assert(pid >= 0);
 311         assert((flags & ~PROCESS_CMDLINE_COMM_FALLBACK) == 0);
 312         assert(ret);
 313
 314         r = pid_get_cmdline_nulstr(pid, SIZE_MAX, flags, &t, &k);
 315         if (r < 0)
 316                 return r;
 317
 318         args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true);
 319         if (!args)
 320                 return -ENOMEM;
 321
 322         *ret = args;
 323         return 0;
 324 }
 325
 326 int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret) {
 327         _cleanup_strv_free_ char **args = NULL;
 328         int r;
 329
 330         if (!pidref_is_set(pid))
 331                 return -ESRCH;
 332
 333         if (pidref_is_remote(pid))
 334                 return -EREMOTE;
 335
 336         r = pid_get_cmdline_strv(pid->pid, flags, &args);
 337         if (r < 0)
 338                 return r;
 339
 340         r = pidref_verify(pid);
 341         if (r < 0)
 342                 return r;
 343
 344         if (ret)
 345                 *ret = TAKE_PTR(args);
 346
 347         return 0;
 348 }
 349
 350 int container_get_leader(const char *machine, pid_t *pid) {
 351         _cleanup_free_ char *s = NULL, *class = NULL;
 352         const char *p;
 353         pid_t leader;
 354         int r;
 355
 356         assert(machine);
 357         assert(pid);
 358
 359         if (streq(machine, ".host")) {
 360                 *pid = 1;
 361                 return 0;
 362         }
 363
 364         if (!hostname_is_valid(machine, 0))
 365                 return -EINVAL;
 366
 367         p = strjoina("/run/systemd/machines/", machine);
 368         r = parse_env_file(NULL, p,
 369                            "LEADER", &s,
 370                            "CLASS", &class);
 371         if (r == -ENOENT)
 372                 return -EHOSTDOWN;
 373         if (r < 0)
 374                 return r;
 375         if (!s)
 376                 return -EIO;
 377
 378         if (!streq_ptr(class, "container"))
 379                 return -EIO;
 380
 381         r = parse_pid(s, &leader);
 382         if (r < 0)
 383                 return r;
 384         if (leader <= 1)
 385                 return -EIO;
 386
 387         *pid = leader;
 388         return 0;
 389 }
 390
 391 int pid_is_kernel_thread(pid_t pid) {
 392         _cleanup_free_ char *line = NULL;
 393         unsigned long long flags;
 394         size_t l, i;
 395         const char *p;
 396         char *q;
 397         int r;
 398
 399         if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
 400                 return 0;
 401         if (!pid_is_valid(pid))
 402                 return -EINVAL;
 403
 404         p = procfs_file_alloca(pid, "stat");
 405         r = read_one_line_file(p, &line);
 406         if (r == -ENOENT)
 407                 return -ESRCH;
 408         if (r < 0)
 409                 return r;
 410
 411         /* Skip past the comm field */
 412         q = strrchr(line, ')');
 413         if (!q)
 414                 return -EINVAL;
 415         q++;
 416
 417         /* Skip 6 fields to reach the flags field */
 418         for (i = 0; i < 6; i++) {
 419                 l = strspn(q, WHITESPACE);
 420                 if (l < 1)
 421                         return -EINVAL;
 422                 q += l;
 423
 424                 l = strcspn(q, WHITESPACE);
 425                 if (l < 1)
 426                         return -EINVAL;
 427                 q += l;
 428         }
 429
 430         /* Skip preceding whitespace */
 431         l = strspn(q, WHITESPACE);
 432         if (l < 1)
 433                 return -EINVAL;
 434         q += l;
 435
 436         /* Truncate the rest */
 437         l = strcspn(q, WHITESPACE);
 438         if (l < 1)
 439                 return -EINVAL;
 440         q[l] = 0;
 441
 442         r = safe_atollu(q, &flags);
 443         if (r < 0)
 444                 return r;
 445
 446         return !!(flags & PF_KTHREAD);
 447 }
 448
 449 int pidref_is_kernel_thread(const PidRef *pid) {
 450         int result, r;
 451
 452         if (!pidref_is_set(pid))
 453                 return -ESRCH;
 454
 455         if (pidref_is_remote(pid))
 456                 return -EREMOTE;
 457
 458         result = pid_is_kernel_thread(pid->pid);
 459         if (result < 0)
 460                 return result;
 461
 462         r = pidref_verify(pid); /* Verify that the PID wasn't reused since */
 463         if (r < 0)
 464                 return r;
 465
 466         return result;
 467 }
 468
 469 static int get_process_link_contents(pid_t pid, const char *proc_file, char **ret) {
 470         const char *p;
 471         int r;
 472
 473         assert(proc_file);
 474
 475         p = procfs_file_alloca(pid, proc_file);
 476
 477         r = readlink_malloc(p, ret);
 478         return (r == -ENOENT && proc_mounted() > 0) ? -ESRCH : r;
 479 }
 480
 481 int get_process_exe(pid_t pid, char **ret) {
 482         char *d;
 483         int r;
 484
 485         assert(pid >= 0);
 486
 487         r = get_process_link_contents(pid, "exe", ret);
 488         if (r < 0)
 489                 return r;
 490
 491         if (ret) {
 492                 d = endswith(*ret, " (deleted)");
 493                 if (d)
 494                         *d = '\0';
 495         }
 496
 497         return 0;
 498 }
 499
 500 int pid_get_uid(pid_t pid, uid_t *ret) {
 501         int r;
 502
 503         assert(pid >= 0);
 504         assert(ret);
 505
 506         if (pid == 0 || pid == getpid_cached()) {
 507                 *ret = getuid();
 508                 return 0;
 509         }
 510
 511         _cleanup_free_ char *v = NULL;
 512         r = procfs_file_get_field(pid, "status", "Uid", &v);
 513         if (r == -ENOENT)
 514                 return -ESRCH;
 515         if (r < 0)
 516                 return r;
 517
 518         return parse_uid(v, ret);
 519 }
 520
 521 int pidref_get_uid(const PidRef *pid, uid_t *ret) {
 522         int r;
 523
 524         if (!pidref_is_set(pid))
 525                 return -ESRCH;
 526
 527         if (pidref_is_remote(pid))
 528                 return -EREMOTE;
 529
 530         if (pid->fd >= 0) {
 531                 r = pidfd_get_uid(pid->fd, ret);
 532                 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
 533                         return r;
 534         }
 535
 536         uid_t uid;
 537         r = pid_get_uid(pid->pid, &uid);
 538         if (r < 0)
 539                 return r;
 540
 541         r = pidref_verify(pid);
 542         if (r < 0)
 543                 return r;
 544
 545         if (ret)
 546                 *ret = uid;
 547         return 0;
 548 }
 549
 550 int get_process_gid(pid_t pid, gid_t *ret) {
 551         int r;
 552
 553         assert(pid >= 0);
 554         assert(ret);
 555
 556         if (pid == 0 || pid == getpid_cached()) {
 557                 *ret = getgid();
 558                 return 0;
 559         }
 560
 561         _cleanup_free_ char *v = NULL;
 562         r = procfs_file_get_field(pid, "status", "Gid", &v);
 563         if (r == -ENOENT)
 564                 return -ESRCH;
 565         if (r < 0)
 566                 return r;
 567
 568         return parse_gid(v, ret);
 569 }
 570
 571 int get_process_cwd(pid_t pid, char **ret) {
 572         assert(pid >= 0);
 573
 574         if (pid == 0 || pid == getpid_cached())
 575                 return safe_getcwd(ret);
 576
 577         return get_process_link_contents(pid, "cwd", ret);
 578 }
 579
 580 int get_process_root(pid_t pid, char **ret) {
 581         assert(pid >= 0);
 582         return get_process_link_contents(pid, "root", ret);
 583 }
 584
/* Upper bound (5 MiB) for the size of the escaped output get_process_environ() generates; once it is
 * reached the call gives up with -ENOBUFS. */
#define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
 586
 587 int get_process_environ(pid_t pid, char **ret) {
 588         _cleanup_fclose_ FILE *f = NULL;
 589         _cleanup_free_ char *outcome = NULL;
 590         size_t sz = 0;
 591         const char *p;
 592         int r;
 593
 594         assert(pid >= 0);
 595         assert(ret);
 596
 597         p = procfs_file_alloca(pid, "environ");
 598
 599         r = fopen_unlocked(p, "re", &f);
 600         if (r == -ENOENT)
 601                 return -ESRCH;
 602         if (r < 0)
 603                 return r;
 604
 605         for (;;) {
 606                 char c;
 607
 608                 if (sz >= ENVIRONMENT_BLOCK_MAX)
 609                         return -ENOBUFS;
 610
 611                 if (!GREEDY_REALLOC(outcome, sz + 5))
 612                         return -ENOMEM;
 613
 614                 r = safe_fgetc(f, &c);
 615                 if (r < 0)
 616                         return r;
 617                 if (r == 0)
 618                         break;
 619
 620                 if (c == '\0')
 621                         outcome[sz++] = '\n';
 622                 else
 623                         sz += cescape_char(c, outcome + sz);
 624         }
 625
 626         outcome[sz] = '\0';
 627         *ret = TAKE_PTR(outcome);
 628
 629         return 0;
 630 }
 631
 632 int pid_get_ppid(pid_t pid, pid_t *ret) {
 633         _cleanup_free_ char *line = NULL;
 634         unsigned long ppid;
 635         const char *p;
 636         int r;
 637
 638         assert(pid >= 0);
 639
 640         if (pid == 0)
 641                 pid = getpid_cached();
 642         if (pid == 1) /* PID 1 has no parent, shortcut this case */
 643                 return -EADDRNOTAVAIL;
 644
 645         if (pid == getpid_cached()) {
 646                 if (ret)
 647                         *ret = getppid();
 648                 return 0;
 649         }
 650
 651         p = procfs_file_alloca(pid, "stat");
 652         r = read_one_line_file(p, &line);
 653         if (r == -ENOENT)
 654                 return -ESRCH;
 655         if (r < 0)
 656                 return r;
 657
 658         /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
 659          * value, so let's skip over it manually */
 660
 661         p = strrchr(line, ')');
 662         if (!p)
 663                 return -EIO;
 664         p++;
 665
 666         if (sscanf(p, " "
 667                    "%*c "  /* state */
 668                    "%lu ", /* ppid */
 669                    &ppid) != 1)
 670                 return -EIO;
 671
 672         /* If ppid is zero the process has no parent. Which might be the case for PID 1 (caught above)
 673          * but also for processes originating in other namespaces that are inserted into a pidns.
 674          * Return a recognizable error in this case. */
 675         if (ppid == 0)
 676                 return -EADDRNOTAVAIL;
 677
 678         if ((pid_t) ppid < 0 || (unsigned long) (pid_t) ppid != ppid)
 679                 return -ERANGE;
 680
 681         if (ret)
 682                 *ret = (pid_t) ppid;
 683
 684         return 0;
 685 }
 686
 687 int pidref_get_ppid(const PidRef *pidref, pid_t *ret) {
 688         int r;
 689
 690         if (!pidref_is_set(pidref))
 691                 return -ESRCH;
 692
 693         if (pidref_is_remote(pidref))
 694                 return -EREMOTE;
 695
 696         if (pidref->fd >= 0) {
 697                 r = pidfd_get_ppid(pidref->fd, ret);
 698                 if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
 699                         return r;
 700         }
 701
 702         pid_t ppid;
 703         r = pid_get_ppid(pidref->pid, ret ? &ppid : NULL);
 704         if (r < 0)
 705                 return r;
 706
 707         r = pidref_verify(pidref);
 708         if (r < 0)
 709                 return r;
 710
 711         if (ret)
 712                 *ret = ppid;
 713         return 0;
 714 }
 715
 716 int pidref_get_ppid_as_pidref(const PidRef *pidref, PidRef *ret) {
 717         pid_t ppid;
 718         int r;
 719
 720         assert(ret);
 721
 722         r = pidref_get_ppid(pidref, &ppid);
 723         if (r < 0)
 724                 return r;
 725
 726         for (unsigned attempt = 0; attempt < 16; attempt++) {
 727                 _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
 728
 729                 r = pidref_set_pid(&parent, ppid);
 730                 if (r < 0)
 731                         return r;
 732
 733                 /* If we have a pidfd of the original PID, let's verify that the process we acquired really
 734                  * is the parent still */
 735                 if (pidref->fd >= 0) {
 736                         r = pidref_get_ppid(pidref, &ppid);
 737                         if (r < 0)
 738                                 return r;
 739
 740                         /* Did the PPID change since we queried it? if so we might have pinned the wrong
 741                          * process, if its PID got reused by now. Let's try again */
 742                         if (parent.pid != ppid)
 743                                 continue;
 744                 }
 745
 746                 *ret = TAKE_PIDREF(parent);
 747                 return 0;
 748         }
 749
 750         /* Give up after 16 tries */
 751         return -ENOTRECOVERABLE;
 752 }
 753
 754 int pid_get_start_time(pid_t pid, usec_t *ret) {
 755         _cleanup_free_ char *line = NULL;
 756         const char *p;
 757         int r;
 758
 759         assert(pid >= 0);
 760
 761         p = procfs_file_alloca(pid, "stat");
 762         r = read_one_line_file(p, &line);
 763         if (r == -ENOENT)
 764                 return -ESRCH;
 765         if (r < 0)
 766                 return r;
 767
 768         /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
 769          * value, so let's skip over it manually */
 770
 771         p = strrchr(line, ')');
 772         if (!p)
 773                 return -EIO;
 774         p++;
 775
 776         unsigned long llu;
 777
 778         if (sscanf(p, " "
 779                    "%*c " /* state */
 780                    "%*u " /* ppid */
 781                    "%*u " /* pgrp */
 782                    "%*u " /* session */
 783                    "%*u " /* tty_nr */
 784                    "%*u " /* tpgid */
 785                    "%*u " /* flags */
 786                    "%*u " /* minflt */
 787                    "%*u " /* cminflt */
 788                    "%*u " /* majflt */
 789                    "%*u " /* cmajflt */
 790                    "%*u " /* utime */
 791                    "%*u " /* stime */
 792                    "%*u " /* cutime */
 793                    "%*u " /* cstime */
 794                    "%*i " /* priority */
 795                    "%*i " /* nice */
 796                    "%*u " /* num_threads */
 797                    "%*u " /* itrealvalue */
 798                    "%lu ", /* starttime */
 799                    &llu) != 1)
 800                 return -EIO;
 801
 802         if (ret)
 803                 *ret = jiffies_to_usec(llu); /* CLOCK_BOOTTIME */
 804
 805         return 0;
 806 }
 807
 808 int pidref_get_start_time(const PidRef *pid, usec_t *ret) {
 809         usec_t t;
 810         int r;
 811
 812         if (!pidref_is_set(pid))
 813                 return -ESRCH;
 814
 815         if (pidref_is_remote(pid))
 816                 return -EREMOTE;
 817
 818         r = pid_get_start_time(pid->pid, ret ? &t : NULL);
 819         if (r < 0)
 820                 return r;
 821
 822         r = pidref_verify(pid);
 823         if (r < 0)
 824                 return r;
 825
 826         if (ret)
 827                 *ret = t;
 828
 829         return 0;
 830 }
 831
 832 int get_process_umask(pid_t pid, mode_t *ret) {
 833         _cleanup_free_ char *m = NULL;
 834         int r;
 835
 836         assert(pid >= 0);
 837         assert(ret);
 838
 839         r = procfs_file_get_field(pid, "status", "Umask", &m);
 840         if (r == -ENOENT)
 841                 return -ESRCH;
 842         if (r < 0)
 843                 return r;
 844
 845         return parse_mode(m, ret);
 846 }
 847
 848 int wait_for_terminate(pid_t pid, siginfo_t *ret) {
 849         return pidref_wait_for_terminate(&PIDREF_MAKE_FROM_PID(pid), ret);
 850 }
 851
 852 /*
 853  * Return values:
 854  * < 0 : wait_for_terminate() failed to get the state of the
 855  *       process, the process was terminated by a signal, or
 856  *       failed for an unknown reason.
 857  * >=0 : The process terminated normally, and its exit code is
 858  *       returned.
 859  *
 860  * That is, success is indicated by a return value of zero, and an
 861  * error is indicated by a non-zero value.
 862  *
 863  * A warning is emitted if the process terminates abnormally,
 864  * and also if it returns non-zero unless check_exit_code is true.
 865  */
 866 int pidref_wait_for_terminate_and_check(const char *name, PidRef *pidref, WaitFlags flags) {
 867         int r;
 868
 869         if (!pidref_is_set(pidref))
 870                 return -ESRCH;
 871         if (pidref_is_remote(pidref))
 872                 return -EREMOTE;
 873         if (pidref->pid == 1 || pidref_is_self(pidref))
 874                 return -ECHILD;
 875
 876         _cleanup_free_ char *buffer = NULL;
 877         if (!name) {
 878                 r = pidref_get_comm(pidref, &buffer);
 879                 if (r < 0)
 880                         log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pidref->pid);
 881                 else
 882                         name = buffer;
 883         }
 884
 885         int prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
 886
 887         siginfo_t status;
 888         r = pidref_wait_for_terminate(pidref, &status);
 889         if (r < 0)
 890                 return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name));
 891
 892         if (status.si_code == CLD_EXITED) {
 893                 if (status.si_status != EXIT_SUCCESS)
 894                         log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
 895                                  "%s failed with exit status %i.", strna(name), status.si_status);
 896                 else
 897                         log_debug("%s succeeded.", name);
 898
 899                 return status.si_status;
 900
 901         } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) {
 902
 903                 log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status));
 904                 return -EPROTO;
 905         }
 906
 907         log_full(prio, "%s failed due to unknown reason.", strna(name));
 908         return -EPROTO;
 909 }
 910
 911 int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) {
 912         return pidref_wait_for_terminate_and_check(name, &PIDREF_MAKE_FROM_PID(pid), flags);
 913 }
 914
 915 /*
 916  * Return values:
 917  *
 918  * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
 919  *       was terminated by a signal, or failed for an unknown reason.
 920  *
 921  * >=0 : The process terminated normally with no failures.
 922  *
 923  * Success is indicated by a return value of zero, a timeout is indicated by ETIMEDOUT, and all other child failure
 924  * states are indicated by error is indicated by a non-zero value.
 925  *
 926  * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
 927  * to remain entirely race-free.
 928  */
 929 int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
 930         sigset_t mask;
 931         int r;
 932         usec_t until;
 933
 934         assert_se(sigemptyset(&mask) == 0);
 935         assert_se(sigaddset(&mask, SIGCHLD) == 0);
 936
 937         /* Drop into a sigtimewait-based timeout. Waiting for the
 938          * pid to exit. */
 939         until = usec_add(now(CLOCK_MONOTONIC), timeout);
 940         for (;;) {
 941                 usec_t n;
 942                 siginfo_t status = {};
 943
 944                 n = now(CLOCK_MONOTONIC);
 945                 if (n >= until)
 946                         break;
 947
 948                 r = RET_NERRNO(sigtimedwait(&mask, NULL, TIMESPEC_STORE(until - n)));
 949                 /* Assuming we woke due to the child exiting. */
 950                 if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
 951                         if (status.si_pid == pid) {
 952                                 /* This is the correct child. */
 953                                 if (status.si_code == CLD_EXITED)
 954                                         return status.si_status == 0 ? 0 : -EPROTO;
 955                                 else
 956                                         return -EPROTO;
 957                         }
 958                 }
 959                 /* Not the child, check for errors and proceed appropriately */
 960                 if (r < 0) {
 961                         switch (r) {
 962                         case -EAGAIN:
 963                                 /* Timed out, child is likely hung. */
 964                                 return -ETIMEDOUT;
 965                         case -EINTR:
 966                                 /* Received a different signal and should retry */
 967                                 continue;
 968                         default:
 969                                 /* Return any unexpected errors */
 970                                 return r;
 971                         }
 972                 }
 973         }
 974
 975         return -EPROTO;
 976 }
 977
void sigkill_wait(pid_t pid) {
        /* Sends SIGKILL to the specified process and then waits for it via wait_for_terminate(). This is
         * best-effort: the results of both calls are deliberately ignored. Only PIDs > 1 are accepted. */

        assert(pid > 1);

        (void) kill(pid, SIGKILL);
        (void) wait_for_terminate(pid, NULL);
}
 984
 985 void sigkill_waitp(pid_t *pid) {
 986         PROTECT_ERRNO;
 987
 988         if (!pid)
 989                 return;
 990         if (*pid <= 1)
 991                 return;
 992
 993         sigkill_wait(*pid);
 994 }
 995
void sigterm_wait(pid_t pid) {
        /* Like sigkill_wait(), but asks the process to terminate with SIGTERM. The signal is sent through
         * kill_and_sigcont(), which follows up with SIGCONT. Errors from either step are ignored. Only
         * PIDs > 1 are accepted. */

        assert(pid > 1);

        (void) kill_and_sigcont(pid, SIGTERM);
        (void) wait_for_terminate(pid, NULL);
}
1002
void sigkill_nowait(pid_t pid) {
        /* Sends SIGKILL to the specified process without waiting for it afterwards. The result of kill()
         * is ignored. Only PIDs > 1 are accepted. */

        assert(pid > 1);

        (void) kill(pid, SIGKILL);
}
1008
1009 void sigkill_nowaitp(pid_t *pid) {
1010         PROTECT_ERRNO;
1011
1012         if (!pid)
1013                 return;
1014         if (*pid <= 1)
1015                 return;
1016
1017         sigkill_nowait(*pid);
1018 }
1019
int kill_and_sigcont(pid_t pid, int sig) {
        int r;

        /* Sends 'sig' to 'pid' and, on success, follows up with SIGCONT. Returns the result of the first
         * kill() as a negative errno-style value on failure; the result of the SIGCONT is ignored. */

        r = RET_NERRNO(kill(pid, sig));
        if (r < 0)
                return r;

        /* No need for an extra SIGCONT if that's what we just sent, nor after SIGKILL, which isn't affected
         * by the process being suspended anyway. */
        if (!IN_SET(sig, SIGCONT, SIGKILL))
                (void) kill(pid, SIGCONT);

        return r;
}
1032
1033 int getenv_for_pid(pid_t pid, const char *field, char **ret) {
1034         _cleanup_fclose_ FILE *f = NULL;
1035         const char *path;
1036         size_t sum = 0;
1037         int r;
1038
1039         assert(pid >= 0);
1040         assert(field);
1041         assert(ret);
1042
1043         if (pid == 0 || pid == getpid_cached())
1044                 return strdup_to_full(ret, getenv(field));
1045
1046         if (!pid_is_valid(pid))
1047                 return -EINVAL;
1048
1049         path = procfs_file_alloca(pid, "environ");
1050
1051         r = fopen_unlocked(path, "re", &f);
1052         if (r == -ENOENT)
1053                 return -ESRCH;
1054         if (r < 0)
1055                 return r;
1056
1057         for (;;) {
1058                 _cleanup_free_ char *line = NULL;
1059                 const char *match;
1060
1061                 if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */
1062                         return -ENOBUFS;
1063
1064                 r = read_nul_string(f, LONG_LINE_MAX, &line);
1065                 if (r < 0)
1066                         return r;
1067                 if (r == 0)  /* EOF */
1068                         break;
1069
1070                 sum += r;
1071
1072                 match = startswith(line, field);
1073                 if (match && *match == '=')
1074                         return strdup_to_full(ret, match + 1);
1075         }
1076
1077         *ret = NULL;
1078         return 0;
1079 }
1080
1081 int pidref_is_my_child(PidRef *pid) {
1082         int r;
1083
1084         if (!pidref_is_set(pid))
1085                 return -ESRCH;
1086
1087         if (pidref_is_remote(pid))
1088                 return -EREMOTE;
1089
1090         if (pid->pid == 1 || pidref_is_self(pid))
1091                 return false;
1092
1093         pid_t ppid;
1094         r = pidref_get_ppid(pid, &ppid);
1095         if (r == -EADDRNOTAVAIL) /* if this process is outside of our pidns, it is definitely not our child */
1096                 return false;
1097         if (r < 0)
1098                 return r;
1099
1100         return ppid == getpid_cached();
1101 }
1102
1103 int pid_is_my_child(pid_t pid) {
1104
1105         if (pid == 0)
1106                 return false;
1107
1108         return pidref_is_my_child(&PIDREF_MAKE_FROM_PID(pid));
1109 }
1110
1111 int pidref_is_unwaited(PidRef *pid) {
1112         int r;
1113
1114         /* Checks whether a PID is still valid at all, including a zombie */
1115
1116         if (!pidref_is_set(pid))
1117                 return -ESRCH;
1118
1119         if (pidref_is_remote(pid))
1120                 return -EREMOTE;
1121
1122         if (pid->pid == 1 || pidref_is_self(pid))
1123                 return true;
1124
1125         r = pidref_kill(pid, 0);
1126         if (r == -ESRCH)
1127                 return false;
1128         if (r < 0)
1129                 return r;
1130
1131         return true;
1132 }
1133
1134 int pid_is_unwaited(pid_t pid) {
1135
1136         if (pid == 0)
1137                 return true;
1138
1139         return pidref_is_unwaited(&PIDREF_MAKE_FROM_PID(pid));
1140 }
1141
int pid_is_alive(pid_t pid) {
        int state;

        /* Checks whether a PID is still valid and not a zombie. Returns -ESRCH for negative PIDs. */

        if (pid < 0)
                return -ESRCH;

        /* If we or PID 1 would be a zombie, this code would not be running */
        if (pid <= 1 || pid == getpid_cached())
                return true;

        state = get_process_state(pid);
        if (state < 0)
                /* A process that is gone is simply "not alive"; anything else is a real error */
                return state == -ESRCH ? false : state;

        return state != 'Z';
}
1164
1165 int pidref_is_alive(const PidRef *pidref) {
1166         int r, result;
1167
1168         if (!pidref_is_set(pidref))
1169                 return -ESRCH;
1170
1171         if (pidref_is_remote(pidref))
1172                 return -EREMOTE;
1173
1174         result = pid_is_alive(pidref->pid);
1175         if (result < 0) {
1176                 assert(result != -ESRCH);
1177                 return result;
1178         }
1179
1180         r = pidref_verify(pidref);
1181         if (r == -ESRCH)
1182                 return false;
1183         if (r < 0)
1184                 return r;
1185
1186         return result;
1187 }
1188
1189 int pidref_from_same_root_fs(PidRef *a, PidRef *b) {
1190         _cleanup_(pidref_done) PidRef self = PIDREF_NULL;
1191         int r;
1192
1193         /* Checks if the two specified processes have the same root fs. Either can be specified as NULL in
1194          * which case we'll check against ourselves. */
1195
1196         if (!a || !b) {
1197                 r = pidref_set_self(&self);
1198                 if (r < 0)
1199                         return r;
1200                 if (!a)
1201                         a = &self;
1202                 if (!b)
1203                         b = &self;
1204         }
1205
1206         if (!pidref_is_set(a) || !pidref_is_set(b))
1207                 return -ESRCH;
1208
1209         /* If one of the two processes have the same root they cannot have the same root fs, but if both of
1210          * them do we don't know */
1211         if (pidref_is_remote(a) && pidref_is_remote(b))
1212                 return -EREMOTE;
1213         if (pidref_is_remote(a) || pidref_is_remote(b))
1214                 return false;
1215
1216         if (pidref_equal(a, b))
1217                 return true;
1218
1219         const char *roota = procfs_file_alloca(a->pid, "root");
1220         const char *rootb = procfs_file_alloca(b->pid, "root");
1221
1222         int result = inode_same(roota, rootb, 0);
1223         if (result == -ENOENT)
1224                 return proc_mounted() == 0 ? -ENOSYS : -ESRCH;
1225         if (result < 0)
1226                 return result;
1227
1228         r = pidref_verify(a);
1229         if (r < 0)
1230                 return r;
1231         r = pidref_verify(b);
1232         if (r < 0)
1233                 return r;
1234
1235         return result;
1236 }
1237
bool is_main_thread(void) {
        /* Per-thread cache of the answer: negative means "not determined yet" */
        static thread_local int cached = -1;

        if (cached >= 0)
                return cached;

        /* We are the main thread if our thread ID matches the process ID */
        cached = getpid_cached() == gettid();
        return cached;
}
1246
bool oom_score_adjust_is_valid(int oa) {
        /* Valid values lie in the inclusive range OOM_SCORE_ADJ_MIN…OOM_SCORE_ADJ_MAX */
        if (oa < OOM_SCORE_ADJ_MIN)
                return false;

        return oa <= OOM_SCORE_ADJ_MAX;
}
1250
1251 unsigned long personality_from_string(const char *p) {
1252         Architecture architecture;
1253
1254         if (!p)
1255                 return PERSONALITY_INVALID;
1256
1257         /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1258          * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1259          * the same register size. */
1260
1261         architecture = architecture_from_string(p);
1262         if (architecture < 0)
1263                 return PERSONALITY_INVALID;
1264
1265         if (architecture == native_architecture())
1266                 return PER_LINUX;
1267 #ifdef ARCHITECTURE_SECONDARY
1268         if (architecture == ARCHITECTURE_SECONDARY)
1269                 return PER_LINUX32;
1270 #endif
1271
1272         return PERSONALITY_INVALID;
1273 }
1274
1275 const char* personality_to_string(unsigned long p) {
1276         Architecture architecture = _ARCHITECTURE_INVALID;
1277
1278         if (p == PER_LINUX)
1279                 architecture = native_architecture();
1280 #ifdef ARCHITECTURE_SECONDARY
1281         else if (p == PER_LINUX32)
1282                 architecture = ARCHITECTURE_SECONDARY;
1283 #endif
1284
1285         if (architecture < 0)
1286                 return NULL;
1287
1288         return architecture_to_string(architecture);
1289 }
1290
int safe_personality(unsigned long p) {
        int ret;

        /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno,
         * and in others as negative return value containing an errno-like value. Let's work around this: this is a
         * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and
         * the return value indicating the same issue, so that we are definitely on the safe side.
         *
         * See https://github.com/systemd/systemd/issues/6737 */

        errno = 0;
        ret = personality(p);
        if (ret >= 0)
                return ret;

        /* Failure reported via errno: turn it into a negative return value */
        if (errno != 0)
                return -errno;

        /* Failure reported via the return value only: mirror it into errno */
        errno = -ret;
        return ret;
}
1312
1313 int opinionated_personality(unsigned long *ret) {
1314         int current;
1315
1316         /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1317          * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1318          * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1319
1320         current = safe_personality(PERSONALITY_INVALID);
1321         if (current < 0)
1322                 return current;
1323
1324         if (((unsigned long) current & OPINIONATED_PERSONALITY_MASK) == PER_LINUX32)
1325                 *ret = PER_LINUX32;
1326         else
1327                 *ret = PER_LINUX;
1328
1329         return 0;
1330 }
1331
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        pid_t helper;

        /* Only relevant when we run as PID 1 under valgrind: fork off a helper child that exits right
         * away, and wait for it. */

        if (getpid_cached() != 1 || !RUNNING_ON_VALGRIND)
                return;

        helper = raw_clone(SIGCHLD);
        if (helper == 0)
                /* Child: nothing to do but exit */
                exit(EXIT_SUCCESS);

        if (helper < 0) {
                log_struct_errno(
                        LOG_EMERG, errno,
                        LOG_MESSAGE_ID(SD_MESSAGE_VALGRIND_HELPER_FORK_STR),
                        LOG_MESSAGE("Failed to fork off valgrind helper: %m"));
                return;
        }

        log_info("Spawned valgrind helper as PID "PID_FMT".", helper);
        (void) wait_for_terminate(helper, NULL);
#endif
}
1351
int pid_compare_func(const pid_t *a, const pid_t *b) {
        /* Three-way comparison of two PIDs by value. Suitable for usage in qsort() */
        pid_t lhs = *a, rhs = *b;

        return CMP(lhs, rhs);
}
1356
bool nice_is_valid(int n) {
        /* Valid nice levels are PRIO_MIN…PRIO_MAX-1, i.e. the upper bound is exclusive */
        if (n < PRIO_MIN)
                return false;

        return n < PRIO_MAX;
}
1360
1361 bool sched_policy_is_valid(int i) {
1362         return IN_SET(i, SCHED_OTHER, SCHED_BATCH, SCHED_IDLE, SCHED_FIFO, SCHED_RR);
1363 }
1364
bool sched_priority_is_valid(int i) {
        /* Valid priorities are 0 up to (and including) the maximum priority reported for SCHED_RR */
        if (i < 0)
                return false;

        return i <= sched_get_priority_max(SCHED_RR);
}
1368
1369 /* The cached PID, possible values:
1370  *
1371  *     == UNSET [0]  → cache not initialized yet
1372  *     == BUSY [-1]  → some thread is initializing it at the moment
1373  *     any other     → the cached PID
1374  */
1375
#define CACHED_PID_UNSET ((pid_t) 0) /* marker: no PID has been cached yet */
#define CACHED_PID_BUSY ((pid_t) -1) /* marker: a getpid_cached() call is in the middle of filling the cache */

/* Backing store for getpid_cached(): holds one of the two markers above, or the cached PID. Cleared again
 * by reset_cached_pid(). */
static pid_t cached_pid = CACHED_PID_UNSET;
1380
void reset_cached_pid(void) {
        /* Invoked in the child after a fork(), i.e. at the first moment the PID changed. getpid_cached()
         * registers this function as the child handler via pthread_atfork(). It marks the cache as
         * unset, so that the next getpid_cached() call queries the PID anew. */
        cached_pid = CACHED_PID_UNSET;
}
1385
1386 pid_t getpid_cached(void) {
1387         static bool installed = false;
1388         pid_t current_value = CACHED_PID_UNSET;
1389
1390         /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1391          * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1392          * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1393          * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1394          *
1395          * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1396          * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1397          */
1398
1399         (void) __atomic_compare_exchange_n(
1400                         &cached_pid,
1401                         &current_value,
1402                         CACHED_PID_BUSY,
1403                         false,
1404                         __ATOMIC_SEQ_CST,
1405                         __ATOMIC_SEQ_CST);
1406
1407         switch (current_value) {
1408
1409         case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
1410                 pid_t new_pid;
1411
1412                 new_pid = getpid();
1413
1414                 if (!installed) {
1415                         /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1416                          * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1417                          * we'll check for errors only in the most generic fashion possible. */
1418
1419                         if (pthread_atfork(NULL, NULL, reset_cached_pid) != 0) {
1420                                 /* OOM? Let's try again later */
1421                                 cached_pid = CACHED_PID_UNSET;
1422                                 return new_pid;
1423                         }
1424
1425                         installed = true;
1426                 }
1427
1428                 cached_pid = new_pid;
1429                 return new_pid;
1430         }
1431
1432         case CACHED_PID_BUSY: /* Somebody else is currently initializing */
1433                 return getpid();
1434
1435         default: /* Properly initialized */
1436                 return current_value;
1437         }
1438 }
1439
int must_be_root(void) {
        /* Returns 0 if our effective UID is 0. Otherwise logs an error and returns the result of
         * log_error_errno() for a synthetic EPERM. */

        if (geteuid() != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root.");

        return 0;
}
1447
1448 pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
1449         size_t ps;
1450         pid_t pid;
1451         void *mystack;
1452
1453         /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1454          * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1455          *
1456          * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1457          * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1458          * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1459          * strictly single-threaded or your child never calls malloc(). */
1460
1461         assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
1462                          CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
1463
1464         /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1465          * the net effect is that the child will have the start of its stack inside the stack of the parent,
1466          * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1467          * since we don't want to deal with differences between systems where the stack grows backwards or
1468          * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1469          * want it page aligned, hence we'll allocate one page more. Makes 3. */
1470
1471         ps = page_size();
1472         mystack = alloca(ps*3);
1473         mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
1474         mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
1475
1476 #if HAVE_CLONE
1477         pid = clone(fn, mystack, flags, userdata);
1478 #else
1479         pid = __clone2(fn, mystack, ps, flags, userdata);
1480 #endif
1481         if (pid < 0)
1482                 return -errno;
1483
1484         return pid;
1485 }
1486
1487 static void restore_sigsetp(sigset_t **ssp) {
1488         if (*ssp)
1489                 (void) sigprocmask(SIG_SETMASK, *ssp, NULL);
1490 }
1491
1492 static int fork_flags_to_signal(ForkFlags flags) {
1493         return (flags & FORK_DEATHSIG_SIGTERM) ? SIGTERM :
1494                 (flags & FORK_DEATHSIG_SIGINT) ? SIGINT :
1495                                                  SIGKILL;
1496 }
1497
1498 int pidref_safe_fork_full(
1499                 const char *name,
1500                 const int stdio_fds[3],
1501                 int except_fds[],
1502                 size_t n_except_fds,
1503                 ForkFlags flags,
1504                 PidRef *ret_pid) {
1505
1506         pid_t original_pid, pid;
1507         sigset_t saved_ss, ss;
1508         _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL;
1509         bool block_signals = false, block_all = false, intermediary = false;
1510         _cleanup_close_pair_ int pidref_transport_fds[2] = EBADF_PAIR;
1511         int prio, r;
1512
1513         assert(!FLAGS_SET(flags, FORK_WAIT|FORK_FREEZE));
1514         assert(!FLAGS_SET(flags, FORK_DETACH) ||
1515                (flags & (FORK_WAIT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL)) == 0);
1516
1517         /* A wrapper around fork(), that does a couple of important initializations in addition to mere
1518          * forking. If provided, ret_pid is initialized in both the parent and the child process, both times
1519          * referencing the child process. Returns == 0 in the child and > 0 in the parent. */
1520
1521         prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;
1522
1523         original_pid = getpid_cached();
1524
1525         if (flags & FORK_FLUSH_STDIO) {
1526                 fflush(stdout);
1527                 fflush(stderr); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
1528         }
1529
1530         if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) {
1531                 /* We temporarily block all signals, so that the new child has them blocked initially. This
1532                  * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
1533                  * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1534
1535                 assert_se(sigfillset(&ss) >= 0);
1536                 block_signals = block_all = true;
1537
1538         } else if (flags & FORK_WAIT) {
1539                 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1540
1541                 assert_se(sigemptyset(&ss) >= 0);
1542                 assert_se(sigaddset(&ss, SIGCHLD) >= 0);
1543                 block_signals = true;
1544         }
1545
1546         if (block_signals) {
1547                 if (sigprocmask(SIG_BLOCK, &ss, &saved_ss) < 0)
1548                         return log_full_errno(prio, errno, "Failed to block signal mask: %m");
1549                 saved_ssp = &saved_ss;
1550         }
1551
1552         if (FLAGS_SET(flags, FORK_DETACH)) {
1553                 /* Fork off intermediary child if needed */
1554
1555                 r = is_reaper_process();
1556                 if (r < 0)
1557                         return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
1558
1559                 if (!r) {
1560                         /* Not a reaper process, hence do a double fork() so we are reparented to one */
1561
1562                         if (ret_pid && socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pidref_transport_fds) < 0)
1563                                 return log_full_errno(prio, errno, "Failed to allocate pidref socket: %m");
1564
1565                         pid = fork();
1566                         if (pid < 0)
1567                                 return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
1568                         if (pid > 0) {
1569                                 log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
1570
1571                                 pidref_transport_fds[1] = safe_close(pidref_transport_fds[1]);
1572
1573                                 if (pidref_transport_fds[0] >= 0) {
1574                                         /* Wait for the intermediary child to exit so the caller can be certain the actual child
1575                                          * process has been reparented by the time this function returns. */
1576                                         r = wait_for_terminate_and_check(name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1577                                         if (r < 0)
1578                                                 return log_full_errno(prio, r, "Failed to wait for intermediary process: %m");
1579                                         if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
1580                                                 return -EPROTO;
1581
1582                                         int pidfd;
1583                                         ssize_t n = receive_one_fd_iov(
1584                                                         pidref_transport_fds[0],
1585                                                         &IOVEC_MAKE(&pid, sizeof(pid)),
1586                                                         /* iovlen= */ 1,
1587                                                         /* flags= */ 0,
1588                                                         &pidfd);
1589                                         if (n < 0)
1590                                                 return log_full_errno(prio, n, "Failed to receive child pidref: %m");
1591
1592                                         *ret_pid = (PidRef) { .pid = pid, .fd = pidfd };
1593                                 }
1594
1595                                 return 1; /* return in the parent */
1596                         }
1597
1598                         pidref_transport_fds[0] = safe_close(pidref_transport_fds[0]);
1599                         intermediary = true;
1600                 }
1601         }
1602
1603         if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0)
1604                 pid = raw_clone(SIGCHLD|
1605                                 (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
1606                                 (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
1607                                 (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) |
1608                                 (FLAGS_SET(flags, FORK_NEW_PIDNS) ? CLONE_NEWPID : 0));
1609         else
1610                 pid = fork();
1611         if (pid < 0)
1612                 return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
1613         if (pid > 0) {
1614
1615                 /* If we are in the intermediary process, exit now */
1616                 if (intermediary) {
1617                         if (pidref_transport_fds[1] >= 0) {
1618                                 _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
1619
1620                                 r = pidref_set_pid(&pidref, pid);
1621                                 if (r < 0) {
1622                                         log_full_errno(prio, r, "Failed to open reference to PID "PID_FMT": %m", pid);
1623                                         _exit(EXIT_FAILURE);
1624                                 }
1625
1626                                 r = send_one_fd_iov(
1627                                                 pidref_transport_fds[1],
1628                                                 pidref.fd,
1629                                                 &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
1630                                                 /* iovlen= */ 1,
1631                                                 /* flags= */ 0);
1632                                 if (r < 0) {
1633                                         log_full_errno(prio, r, "Failed to send child pidref: %m");
1634                                         _exit(EXIT_FAILURE);
1635                                 }
1636                         }
1637
1638                         _exit(EXIT_SUCCESS);
1639                 }
1640
1641                 /* We are in the parent process */
1642                 log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
1643
1644                 if (flags & FORK_WAIT) {
1645                         if (block_all) {
1646                                 /* undo everything except SIGCHLD */
1647                                 ss = saved_ss;
1648                                 assert_se(sigaddset(&ss, SIGCHLD) >= 0);
1649                                 (void) sigprocmask(SIG_SETMASK, &ss, NULL);
1650                         }
1651
1652                         r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? WAIT_LOG : 0));
1653                         if (r < 0)
1654                                 return r;
1655                         if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
1656                                 return -EPROTO;
1657
1658                         /* If we are in the parent and successfully waited, then the process doesn't exist anymore. */
1659                         if (ret_pid)
1660                                 *ret_pid = PIDREF_NULL;
1661
1662                         return 1;
1663                 }
1664
1665                 if (ret_pid) {
1666                         if (FLAGS_SET(flags, FORK_PID_ONLY))
1667                                 *ret_pid = PIDREF_MAKE_FROM_PID(pid);
1668                         else {
1669                                 r = pidref_set_pid(ret_pid, pid);
1670                                 if (r < 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
1671                                         *ret_pid = PIDREF_MAKE_FROM_PID(pid);
1672                         }
1673                 }
1674
1675                 return 1;
1676         }
1677
1678         /* We are in the child process */
1679
1680         pidref_transport_fds[1] = safe_close(pidref_transport_fds[1]);
1681
1682         /* Restore signal mask manually */
1683         saved_ssp = NULL;
1684
1685         if (flags & FORK_REOPEN_LOG) {
1686                 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1687                 log_close();
1688                 log_set_open_when_needed(true);
1689                 log_settle_target();
1690         }
1691
1692         if (name) {
1693                 r = rename_process(name);
1694                 if (r < 0)
1695                         log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
1696                                        r, "Failed to rename process, ignoring: %m");
1697         }
1698
1699         if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL))
1700                 if (prctl(PR_SET_PDEATHSIG, fork_flags_to_signal(flags)) < 0) {
1701                         log_full_errno(prio, errno, "Failed to set death signal: %m");
1702                         _exit(EXIT_FAILURE);
1703                 }
1704
1705         if (flags & FORK_RESET_SIGNALS) {
1706                 r = reset_all_signal_handlers();
1707                 if (r < 0) {
1708                         log_full_errno(prio, r, "Failed to reset signal handlers: %m");
1709                         _exit(EXIT_FAILURE);
1710                 }
1711
1712                 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1713                 r = reset_signal_mask();
1714                 if (r < 0) {
1715                         log_full_errno(prio, r, "Failed to reset signal mask: %m");
1716                         _exit(EXIT_FAILURE);
1717                 }
1718         } else if (block_signals) { /* undo what we did above */
1719                 if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
1720                         log_full_errno(prio, errno, "Failed to restore signal mask: %m");
1721                         _exit(EXIT_FAILURE);
1722                 }
1723         }
1724
1725         if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGINT)) {
1726                 pid_t ppid;
1727                 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1728                  * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1729
1730                 ppid = getppid();
1731                 if (ppid == 0)
1732                         /* Parent is in a different PID namespace. */;
1733                 else if (ppid != original_pid) {
1734                         int sig = fork_flags_to_signal(flags);
1735                         log_debug("Parent died early, raising %s.", signal_to_string(sig));
1736                         (void) raise(sig);
1737                         _exit(EXIT_FAILURE);
1738                 }
1739         }
1740
1741         if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
1742                 /* Optionally, make sure we never propagate mounts to the host. */
1743                 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1744                         log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
1745                         _exit(EXIT_FAILURE);
1746                 }
1747         }
1748
1749         if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
1750                 assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));
1751
1752                 /* Optionally, overmount new tmpfs instance on /tmp/. */
1753                 r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
1754                                    MS_NOSUID|MS_NODEV,
1755                                    "mode=01777" TMPFS_LIMITS_RUN);
1756                 if (r < 0) {
1757                         log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
1758                         _exit(EXIT_FAILURE);
1759                 }
1760         }
1761
1762         if (flags & FORK_REARRANGE_STDIO) {
1763                 if (stdio_fds) {
1764                         r = rearrange_stdio(stdio_fds[0], stdio_fds[1], stdio_fds[2]);
1765                         if (r < 0) {
1766                                 log_full_errno(prio, r, "Failed to rearrange stdio fds: %m");
1767                                 _exit(EXIT_FAILURE);
1768                         }
1769
1770                         /* Turn off O_NONBLOCK on the fdio fds, in case it was left on */
1771                         stdio_disable_nonblock();
1772                 } else {
1773                         r = make_null_stdio();
1774                         if (r < 0) {
1775                                 log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
1776                                 _exit(EXIT_FAILURE);
1777                         }
1778                 }
1779         } else if (flags & FORK_STDOUT_TO_STDERR) {
1780                 if (dup2(STDERR_FILENO, STDOUT_FILENO) < 0) {
1781                         log_full_errno(prio, errno, "Failed to connect stdout to stderr: %m");
1782                         _exit(EXIT_FAILURE);
1783                 }
1784         }
1785
1786         if (flags & FORK_CLOSE_ALL_FDS) {
1787                 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1788                 log_close();
1789
1790                 r = close_all_fds(except_fds, n_except_fds);
1791                 if (r < 0) {
1792                         log_full_errno(prio, r, "Failed to close all file descriptors: %m");
1793                         _exit(EXIT_FAILURE);
1794                 }
1795         }
1796
1797         if (flags & FORK_PACK_FDS) {
1798                 /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1799                  * open, this is including the log. This is required by pack_fds, which will
1800                  * get stuck in an infinite loop of any FDs other than except_fds are open. */
1801                 assert(FLAGS_SET(flags, FORK_CLOSE_ALL_FDS));
1802
1803                 r = pack_fds(except_fds, n_except_fds);
1804                 if (r < 0) {
1805                         log_full_errno(prio, r, "Failed to pack file descriptors: %m");
1806                         _exit(EXIT_FAILURE);
1807                 }
1808         }
1809
1810         if (flags & FORK_CLOEXEC_OFF) {
1811                 r = fd_cloexec_many(except_fds, n_except_fds, false);
1812                 if (r < 0) {
1813                         log_full_errno(prio, r, "Failed to turn off O_CLOEXEC on file descriptors: %m");
1814                         _exit(EXIT_FAILURE);
1815                 }
1816         }
1817
1818         /* When we were asked to reopen the logs, do so again now */
1819         if (flags & FORK_REOPEN_LOG) {
1820                 log_open();
1821                 log_set_open_when_needed(false);
1822         }
1823
1824         if (flags & FORK_RLIMIT_NOFILE_SAFE) {
1825                 r = rlimit_nofile_safe();
1826                 if (r < 0) {
1827                         log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1828                         _exit(EXIT_FAILURE);
1829                 }
1830         }
1831
1832         if (!FLAGS_SET(flags, FORK_KEEP_NOTIFY_SOCKET)) {
1833                 r = RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
1834                 if (r < 0) {
1835                         log_full_errno(prio, r, "Failed to unset $NOTIFY_SOCKET: %m");
1836                         _exit(EXIT_FAILURE);
1837                 }
1838         }
1839
1840         if (FLAGS_SET(flags, FORK_FREEZE))
1841                 freeze();
1842
1843         if (ret_pid) {
1844                 if (FLAGS_SET(flags, FORK_PID_ONLY))
1845                         *ret_pid = PIDREF_MAKE_FROM_PID(getpid_cached());
1846                 else {
1847                         r = pidref_set_self(ret_pid);
1848                         if (r < 0) {
1849                                 log_full_errno(prio, r, "Failed to acquire PID reference on ourselves: %m");
1850                                 _exit(EXIT_FAILURE);
1851                         }
1852                 }
1853         }
1854
1855         return 0;
1856 }
1857
1858 int safe_fork_full(
1859                 const char *name,
1860                 const int stdio_fds[3],
1861                 int except_fds[],
1862                 size_t n_except_fds,
1863                 ForkFlags flags,
1864                 pid_t *ret_pid) {
1865
1866         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
1867         int r;
1868
1869         /* Getting the detached child process pid without pidfd is racy, so don't allow it if not returning
1870          * a pidref to the caller. */
1871         assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid);
1872
1873         r = pidref_safe_fork_full(name, stdio_fds, except_fds, n_except_fds, flags|FORK_PID_ONLY, ret_pid ? &pidref : NULL);
1874         if (r < 0 || !ret_pid)
1875                 return r;
1876
1877         *ret_pid = pidref.pid;
1878
1879         return r;
1880 }
1881
1882 int namespace_fork(
1883                 const char *outer_name,
1884                 const char *inner_name,
1885                 int except_fds[],
1886                 size_t n_except_fds,
1887                 ForkFlags flags,
1888                 int pidns_fd,
1889                 int mntns_fd,
1890                 int netns_fd,
1891                 int userns_fd,
1892                 int root_fd,
1893                 pid_t *ret_pid) {
1894
1895         int r;
1896
1897         /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1898          * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1899          * /proc/self/fd works correctly. */
1900
1901         r = safe_fork_full(outer_name,
1902                            NULL,
1903                            except_fds, n_except_fds,
1904                            (flags|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
1905         if (r < 0)
1906                 return r;
1907         if (r == 0) {
1908                 pid_t pid;
1909
1910                 /* Child */
1911
1912                 r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd);
1913                 if (r < 0) {
1914                         log_full_errno(FLAGS_SET(flags, FORK_LOG) ? LOG_ERR : LOG_DEBUG, r, "Failed to join namespace: %m");
1915                         _exit(EXIT_FAILURE);
1916                 }
1917
1918                 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1919                 r = safe_fork_full(inner_name,
1920                                    NULL,
1921                                    except_fds, n_except_fds,
1922                                    flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO), &pid);
1923                 if (r < 0)
1924                         _exit(EXIT_FAILURE);
1925                 if (r == 0) {
1926                         /* Child */
1927                         if (ret_pid)
1928                                 *ret_pid = pid;
1929                         return 0;
1930                 }
1931
1932                 r = wait_for_terminate_and_check(inner_name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1933                 if (r < 0)
1934                         _exit(EXIT_FAILURE);
1935
1936                 _exit(r);
1937         }
1938
1939         return 1;
1940 }
1941
1942 int set_oom_score_adjust(int value) {
1943         char t[DECIMAL_STR_MAX(int)];
1944
1945         if (!oom_score_adjust_is_valid(value))
1946                 return -EINVAL;
1947
1948         xsprintf(t, "%i", value);
1949
1950         return write_string_file("/proc/self/oom_score_adj", t,
1951                                  WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
1952 }
1953
1954 int get_oom_score_adjust(int *ret) {
1955         _cleanup_free_ char *t = NULL;
1956         int r, a;
1957
1958         r = read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX, &t, NULL);
1959         if (r < 0)
1960                 return r;
1961
1962         delete_trailing_chars(t, WHITESPACE);
1963
1964         r = safe_atoi(t, &a);
1965         if (r < 0)
1966                 return r;
1967
1968         if (!oom_score_adjust_is_valid(a))
1969                 return -ENODATA;
1970
1971         if (ret)
1972                 *ret = a;
1973
1974         return 0;
1975 }
1976
/* Converts an RLIMIT_NICE style limit into the lowest nice level that limit permits.
 *
 * The mapping is PRIO_MAX - limit, clamped at both ends: limits of 0 and 1 both map to PRIO_MAX-1, and
 * anything at or above the full width of the nice range (-PRIO_MIN + PRIO_MAX) maps to PRIO_MIN. */
static int rlimit_to_nice(rlim_t limit) {
        const rlim_t full_range = -PRIO_MIN + PRIO_MAX; /* i.e. 40 */

        if (limit >= full_range)
                return PRIO_MIN; /* i.e. -20 */

        /* From here on the value is known to be small, hence the cast below cannot truncate */
        return limit <= 1 ? PRIO_MAX-1 /* i.e. 19 */ : PRIO_MAX - (int) limit;
}
1986
/* Tries to set the nice level of the calling process to 'priority'. If that is refused for lack of
 * privileges, settles for the closest level RLIMIT_NICE allows.
 *
 * Returns 1 if the requested level is in effect (either freshly set, or because it already was), 0 if
 * only the next best level could be used, and a negative errno on failure. */
int setpriority_closest(int priority) {
        struct rlimit nice_rlimit;
        int err, now, best;

        /* First, simply try what was asked for */
        err = RET_NERRNO(setpriority(PRIO_PROCESS, 0, priority));
        if (err >= 0)
                return 1;
        if (!ERRNO_IS_NEG_PRIVILEGE(err))
                return err;

        /* getpriority() may legitimately return negative values, hence errors can only be told apart via
         * errno, which must be cleared beforehand. */
        errno = 0;
        now = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        if (now == priority)
                return 1;

        /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it
         * doesn't, then the whole setpriority() system call is blocked to us, hence let's propagate the
         * error right-away */
        if (now < priority)
                return err;

        if (getrlimit(RLIMIT_NICE, &nice_rlimit) < 0)
                return -errno;

        best = rlimit_to_nice(nice_rlimit.rlim_cur);

        /* Push to the allowed limit if we're higher than that. Note that we could also be less nice than
         * the limit allows us, but still higher than what's requested. In that case our current value is
         * the best choice and nothing is changed. */
        if (now > best && setpriority(PRIO_PROCESS, 0, best) < 0)
                return -errno;

        log_debug("Cannot set requested nice level (%i), using next best (%i).", priority, MIN(now, best));
        return 0;
}
2027
2028 _noreturn_ void freeze(void) {
2029         log_close();
2030
2031         /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
2032          * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
2033          * to be compatible with being called from signal handlers. */
2034         (void) close_all_fds_without_malloc(NULL, 0);
2035
2036         /* Let's not freeze right away, but keep reaping zombies. */
2037         for (;;) {
2038                 siginfo_t si = {};
2039
2040                 if (waitid(P_ALL, 0, &si, WEXITED) < 0 && errno != EINTR)
2041                         break;
2042         }
2043
2044         /* waitid() failed with an ECHLD error (because there are no left-over child processes) or any other
2045          * (unexpected) error. Freeze for good now! */
2046         for (;;)
2047                 pause();
2048 }
2049
2050 int get_process_threads(pid_t pid) {
2051         _cleanup_free_ char *t = NULL;
2052         int n, r;
2053
2054         if (pid < 0)
2055                 return -EINVAL;
2056
2057         r = procfs_file_get_field(pid, "status", "Threads", &t);
2058         if (r == -ENOENT)
2059                 return -ESRCH;
2060         if (r < 0)
2061                 return r;
2062
2063         r = safe_atoi(t, &n);
2064         if (r < 0)
2065                 return r;
2066         if (n < 0)
2067                 return -EINVAL;
2068
2069         return n;
2070 }
2071
/* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
 * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called.
 *
 * Returns > 0 if so, 0 if not, and a negative errno if the prctl() query fails. */
int is_reaper_process(void) {
        int subreaper = 0;

        /* PID 1 is a reaper by definition, no need to ask the kernel */
        if (getpid_cached() == 1)
                return true;

        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &subreaper, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return subreaper != 0;
}
2086
/* Turns the child subreaper status of the calling process on ('b' true) or off ('b' false) via
 * PR_SET_CHILD_SUBREAPER.
 *
 * For PID 1 no prctl() is issued: enabling is a successful no-op, disabling is refused with -EINVAL.
 * Returns 0 on success, a negative errno otherwise. */
int make_reaper_process(bool b) {

        if (getpid_cached() == 1)
                return b ? 0 : -EINVAL;

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2104
2105 DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
2106
2107 int posix_spawn_wrapper(
2108                 const char *path,
2109                 char * const *argv,
2110                 char * const *envp,
2111                 const char *cgroup,
2112                 PidRef *ret_pidref) {
2113
2114         short flags = POSIX_SPAWN_SETSIGMASK;
2115         posix_spawnattr_t attr;
2116         sigset_t mask;
2117         int r;
2118
2119         /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2120          * caller will be blocked until the child either exits or exec's. The memory of the child will be
2121          * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2122          * issues.
2123          *
2124          * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2125          * if available.
2126          * returns 1: We're already in the right cgroup
2127          *         0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2128          *            needs to call 'cg_attach' on their own */
2129
2130         assert(path);
2131         assert(argv);
2132         assert(ret_pidref);
2133
2134         assert_se(sigfillset(&mask) >= 0);
2135
2136         r = posix_spawnattr_init(&attr);
2137         if (r != 0)
2138                 return -r; /* These functions return a positive errno on failure */
2139
2140         /* Initialization needs to succeed before we can set up a destructor. */
2141         _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
2142
2143 #if HAVE_PIDFD_SPAWN
2144         static bool have_clone_into_cgroup = true; /* kernel 5.7+ */
2145         _cleanup_close_ int cgroup_fd = -EBADF;
2146
2147         if (cgroup && have_clone_into_cgroup) {
2148                 _cleanup_free_ char *resolved_cgroup = NULL;
2149
2150                 r = cg_get_path_and_check(
2151                                 SYSTEMD_CGROUP_CONTROLLER,
2152                                 cgroup,
2153                                 /* suffix= */ NULL,
2154                                 &resolved_cgroup);
2155                 if (r < 0)
2156                         return r;
2157
2158                 cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
2159                 if (cgroup_fd < 0)
2160                         return -errno;
2161
2162                 r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
2163                 if (r != 0)
2164                         return -r;
2165
2166                 flags |= POSIX_SPAWN_SETCGROUP;
2167         }
2168 #endif
2169
2170         r = posix_spawnattr_setflags(&attr, flags);
2171         if (r != 0)
2172                 return -r;
2173         r = posix_spawnattr_setsigmask(&attr, &mask);
2174         if (r != 0)
2175                 return -r;
2176
2177 #if HAVE_PIDFD_SPAWN
2178         _cleanup_close_ int pidfd = -EBADF;
2179
2180         r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
2181         if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
2182                 return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
2183                                     turn that into something recognizable */
2184         if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
2185             FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
2186                 /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
2187                  * need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
2188                  * Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
2189                  * but not CLONE_INTO_CGROUP. */
2190
2191                 /* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
2192                  * retry every time. */
2193                 have_clone_into_cgroup = false;
2194
2195                 flags &= ~POSIX_SPAWN_SETCGROUP;
2196                 r = posix_spawnattr_setflags(&attr, flags);
2197                 if (r != 0)
2198                         return -r;
2199
2200                 r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
2201         }
2202         if (r != 0)
2203                 return -r;
2204
2205         r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
2206         if (r < 0)
2207                 return r;
2208
2209         return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
2210 #else
2211         pid_t pid;
2212
2213         r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
2214         if (r != 0)
2215                 return -r;
2216
2217         r = pidref_set_pid(ret_pidref, pid);
2218         if (r < 0)
2219                 return r;
2220
2221         return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
2222 #endif
2223 }
2224
/* Opens /proc for enumeration.
 *
 * On success returns 0 and stores the directory stream in *ret; the caller owns it. On failure returns
 * a negative errno and leaves *ret untouched. */
int proc_dir_open(DIR **ret) {
        assert(ret);

        DIR *proc = opendir("/proc");
        if (proc) {
                *ret = proc;
                return 0;
        }

        return -errno;
}
2237
2238 int proc_dir_read(DIR *d, pid_t *ret) {
2239         assert(d);
2240
2241         for (;;) {
2242                 struct dirent *de;
2243
2244                 errno = 0;
2245                 de = readdir_no_dot(d);
2246                 if (!de) {
2247                         if (errno != 0)
2248                                 return -errno;
2249
2250                         break;
2251                 }
2252
2253                 if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN))
2254                         continue;
2255
2256                 if (parse_pid(de->d_name, ret) >= 0)
2257                         return 1;
2258         }
2259
2260         if (ret)
2261                 *ret = 0;
2262         return 0;
2263 }
2264
2265 int proc_dir_read_pidref(DIR *d, PidRef *ret) {
2266         int r;
2267
2268         assert(d);
2269
2270         for (;;) {
2271                 pid_t pid;
2272
2273                 r = proc_dir_read(d, &pid);
2274                 if (r < 0)
2275                         return r;
2276                 if (r == 0)
2277                         break;
2278
2279                 r = pidref_set_pid(ret, pid);
2280                 if (r == -ESRCH) /* gone by now? skip it */
2281                         continue;
2282                 if (r < 0)
2283                         return r;
2284
2285                 return 1;
2286         }
2287
2288         if (ret)
2289                 *ret = PIDREF_NULL;
2290         return 0;
2291 }
2292
2293 static const char *const sigchld_code_table[] = {
2294         [CLD_EXITED] = "exited",
2295         [CLD_KILLED] = "killed",
2296         [CLD_DUMPED] = "dumped",
2297         [CLD_TRAPPED] = "trapped",
2298         [CLD_STOPPED] = "stopped",
2299         [CLD_CONTINUED] = "continued",
2300 };
2301
2302 DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
2303
2304 static const char* const sched_policy_table[] = {
2305         [SCHED_OTHER] = "other",
2306         [SCHED_BATCH] = "batch",
2307         [SCHED_IDLE] = "idle",
2308         [SCHED_FIFO] = "fifo",
2309         [SCHED_RR] = "rr",
2310 };
2311
2312 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);
2313
2314 _noreturn_ void report_errno_and_exit(int errno_fd, int error) {
2315         int r;
2316
2317         if (error >= 0)
2318                 _exit(EXIT_SUCCESS);
2319
2320         assert(errno_fd >= 0);
2321
2322         r = loop_write(errno_fd, &error, sizeof(error));
2323         if (r < 0)
2324                 log_debug_errno(r, "Failed to write errno to errno_fd=%d: %m", errno_fd);
2325
2326         _exit(EXIT_FAILURE);
2327 }
2328
/* Reads an error code, written as a raw int, from 'errno_fd'.
 *
 * The issue here is that it's impossible to distinguish between an error code returned by the child
 * and an I/O error that arose while reading it. So, the function logs errors and returns -EIO for the
 * latter case.
 *
 * Returns 0 if nothing at all could be read (the writer went away without reporting anything, which is
 * taken as success) or if a literal 0 was read, the negative error code if one was received, and -EIO
 * on read failure, on a short read, or if the received value is positive. */
int read_errno(int errno_fd) {
        int code;
        ssize_t got;

        assert(errno_fd >= 0);

        got = loop_read(errno_fd, &code, sizeof(code), /* do_poll = */ false);
        if (got < 0) {
                log_debug_errno(got, "Failed to read errno: %m");
                return -EIO;
        }

        /* the process exited without reporting an error, assuming success */
        if (got == 0)
                return 0;

        if (got != sizeof(code))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Received unexpected amount of bytes while reading errno.");

        if (code > 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Received an errno, but it's a positive value.");

        if (code < 0) /* child process reported an error, return it */
                return log_debug_errno(code, "Child process failed with errno: %m");

        return 0;
}