src/basic/process-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <linux/oom.h>
  25 #include <sched.h>
  26 #include <signal.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdio_ext.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32 #include <sys/mman.h>
  33 #include <sys/personality.h>
  34 #include <sys/prctl.h>
  35 #include <sys/types.h>
  36 #include <sys/wait.h>
  37 #include <syslog.h>
  38 #include <unistd.h>
  39 #if HAVE_VALGRIND_VALGRIND_H
  40 #include <valgrind/valgrind.h>
  41 #endif
  42
  43 #include "alloc-util.h"
  44 #include "architecture.h"
  45 #include "escape.h"
  46 #include "fd-util.h"
  47 #include "fileio.h"
  48 #include "fs-util.h"
  49 #include "ioprio.h"
  50 #include "log.h"
  51 #include "macro.h"
  52 #include "missing.h"
  53 #include "process-util.h"
  54 #include "raw-clone.h"
  55 #include "signal-util.h"
  56 #include "stat-util.h"
  57 #include "string-table.h"
  58 #include "string-util.h"
  59 #include "terminal-util.h"
  60 #include "user-util.h"
  61 #include "util.h"
  62
  63 int get_process_state(pid_t pid) {
  64         const char *p;
  65         char state;
  66         int r;
  67         _cleanup_free_ char *line = NULL;
  68
  69         assert(pid >= 0);
  70
  71         p = procfs_file_alloca(pid, "stat");
  72
  73         r = read_one_line_file(p, &line);
  74         if (r == -ENOENT)
  75                 return -ESRCH;
  76         if (r < 0)
  77                 return r;
  78
  79         p = strrchr(line, ')');
  80         if (!p)
  81                 return -EIO;
  82
  83         p++;
  84
  85         if (sscanf(p, " %c", &state) != 1)
  86                 return -EIO;
  87
  88         return (unsigned char) state;
  89 }
  90
  91 int get_process_comm(pid_t pid, char **name) {
  92         const char *p;
  93         int r;
  94
  95         assert(name);
  96         assert(pid >= 0);
  97
  98         p = procfs_file_alloca(pid, "comm");
  99
 100         r = read_one_line_file(p, name);
 101         if (r == -ENOENT)
 102                 return -ESRCH;
 103
 104         return r;
 105 }
 106
 107 int get_process_cmdline(pid_t pid, size_t max_length, bool comm_fallback, char **line) {
 108         _cleanup_fclose_ FILE *f = NULL;
 109         bool space = false;
 110         char *k, *ans = NULL;
 111         const char *p;
 112         int c;
 113
 114         assert(line);
 115         assert(pid >= 0);
 116
 117         /* Retrieves a process' command line. Replaces unprintable characters while doing so by whitespace (coalescing
 118          * multiple sequential ones into one). If max_length is != 0 will return a string of the specified size at most
 119          * (the trailing NUL byte does count towards the length here!), abbreviated with a "..." ellipsis. If
 120          * comm_fallback is true and the process has no command line set (the case for kernel threads), or has a
 121          * command line that resolves to the empty string will return the "comm" name of the process instead.
 122          *
 123          * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
 124          * comm_fallback is false). Returns 0 and sets *line otherwise. */
 125
 126         p = procfs_file_alloca(pid, "cmdline");
 127
 128         f = fopen(p, "re");
 129         if (!f) {
 130                 if (errno == ENOENT)
 131                         return -ESRCH;
 132                 return -errno;
 133         }
 134
 135         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
 136
 137         if (max_length == 1) {
 138
 139                 /* If there's only room for one byte, return the empty string */
 140                 ans = new0(char, 1);
 141                 if (!ans)
 142                         return -ENOMEM;
 143
 144                 *line = ans;
 145                 return 0;
 146
 147         } else if (max_length == 0) {
 148                 size_t len = 0, allocated = 0;
 149
 150                 while ((c = getc(f)) != EOF) {
 151
 152                         if (!GREEDY_REALLOC(ans, allocated, len+3)) {
 153                                 free(ans);
 154                                 return -ENOMEM;
 155                         }
 156
 157                         if (isprint(c)) {
 158                                 if (space) {
 159                                         ans[len++] = ' ';
 160                                         space = false;
 161                                 }
 162
 163                                 ans[len++] = c;
 164                         } else if (len > 0)
 165                                 space = true;
 166                }
 167
 168                 if (len > 0)
 169                         ans[len] = '\0';
 170                 else
 171                         ans = mfree(ans);
 172
 173         } else {
 174                 bool dotdotdot = false;
 175                 size_t left;
 176
 177                 ans = new(char, max_length);
 178                 if (!ans)
 179                         return -ENOMEM;
 180
 181                 k = ans;
 182                 left = max_length;
 183                 while ((c = getc(f)) != EOF) {
 184
 185                         if (isprint(c)) {
 186
 187                                 if (space) {
 188                                         if (left <= 2) {
 189                                                 dotdotdot = true;
 190                                                 break;
 191                                         }
 192
 193                                         *(k++) = ' ';
 194                                         left--;
 195                                         space = false;
 196                                 }
 197
 198                                 if (left <= 1) {
 199                                         dotdotdot = true;
 200                                         break;
 201                                 }
 202
 203                                 *(k++) = (char) c;
 204                                 left--;
 205                         } else if (k > ans)
 206                                 space = true;
 207                 }
 208
 209                 if (dotdotdot) {
 210                         if (max_length <= 4) {
 211                                 k = ans;
 212                                 left = max_length;
 213                         } else {
 214                                 k = ans + max_length - 4;
 215                                 left = 4;
 216
 217                                 /* Eat up final spaces */
 218                                 while (k > ans && isspace(k[-1])) {
 219                                         k--;
 220                                         left++;
 221                                 }
 222                         }
 223
 224                         strncpy(k, "...", left-1);
 225                         k[left-1] = 0;
 226                 } else
 227                         *k = 0;
 228         }
 229
 230         /* Kernel threads have no argv[] */
 231         if (isempty(ans)) {
 232                 _cleanup_free_ char *t = NULL;
 233                 int h;
 234
 235                 free(ans);
 236
 237                 if (!comm_fallback)
 238                         return -ENOENT;
 239
 240                 h = get_process_comm(pid, &t);
 241                 if (h < 0)
 242                         return h;
 243
 244                 if (max_length == 0)
 245                         ans = strjoin("[", t, "]");
 246                 else {
 247                         size_t l;
 248
 249                         l = strlen(t);
 250
 251                         if (l + 3 <= max_length)
 252                                 ans = strjoin("[", t, "]");
 253                         else if (max_length <= 6) {
 254
 255                                 ans = new(char, max_length);
 256                                 if (!ans)
 257                                         return -ENOMEM;
 258
 259                                 memcpy(ans, "[...]", max_length-1);
 260                                 ans[max_length-1] = 0;
 261                         } else {
 262                                 char *e;
 263
 264                                 t[max_length - 6] = 0;
 265
 266                                 /* Chop off final spaces */
 267                                 e = strchr(t, 0);
 268                                 while (e > t && isspace(e[-1]))
 269                                         e--;
 270                                 *e = 0;
 271
 272                                 ans = strjoin("[", t, "...]");
 273                         }
 274                 }
 275                 if (!ans)
 276                         return -ENOMEM;
 277         }
 278
 279         *line = ans;
 280         return 0;
 281 }
 282
 283 int rename_process(const char name[]) {
 284         static size_t mm_size = 0;
 285         static char *mm = NULL;
 286         bool truncated = false;
 287         size_t l;
 288
 289         /* This is a like a poor man's setproctitle(). It changes the comm field, argv[0], and also the glibc's
 290          * internally used name of the process. For the first one a limit of 16 chars applies; to the second one in
 291          * many cases one of 10 (i.e. length of "/sbin/init") — however if we have CAP_SYS_RESOURCES it is unbounded;
 292          * to the third one 7 (i.e. the length of "systemd". If you pass a longer string it will likely be
 293          * truncated.
 294          *
 295          * Returns 0 if a name was set but truncated, > 0 if it was set but not truncated. */
 296
 297         if (isempty(name))
 298                 return -EINVAL; /* let's not confuse users unnecessarily with an empty name */
 299
 300         if (!is_main_thread())
 301                 return -EPERM; /* Let's not allow setting the process name from other threads than the main one, as we
 302                                 * cache things without locking, and we make assumptions that PR_SET_NAME sets the
 303                                 * process name that isn't correct on any other threads */
 304
 305         l = strlen(name);
 306
 307         /* First step, change the comm field. The main thread's comm is identical to the process comm. This means we
 308          * can use PR_SET_NAME, which sets the thread name for the calling thread. */
 309         if (prctl(PR_SET_NAME, name) < 0)
 310                 log_debug_errno(errno, "PR_SET_NAME failed: %m");
 311         if (l > 15) /* Linux process names can be 15 chars at max */
 312                 truncated = true;
 313
 314         /* Second step, change glibc's ID of the process name. */
 315         if (program_invocation_name) {
 316                 size_t k;
 317
 318                 k = strlen(program_invocation_name);
 319                 strncpy(program_invocation_name, name, k);
 320                 if (l > k)
 321                         truncated = true;
 322         }
 323
 324         /* Third step, completely replace the argv[] array the kernel maintains for us. This requires privileges, but
 325          * has the advantage that the argv[] array is exactly what we want it to be, and not filled up with zeros at
 326          * the end. This is the best option for changing /proc/self/cmdline. */
 327
 328         /* Let's not bother with this if we don't have euid == 0. Strictly speaking we should check for the
 329          * CAP_SYS_RESOURCE capability which is independent of the euid. In our own code the capability generally is
 330          * present only for euid == 0, hence let's use this as quick bypass check, to avoid calling mmap() if
 331          * PR_SET_MM_ARG_{START,END} fails with EPERM later on anyway. After all geteuid() is dead cheap to call, but
 332          * mmap() is not. */
 333         if (geteuid() != 0)
 334                 log_debug("Skipping PR_SET_MM, as we don't have privileges.");
 335         else if (mm_size < l+1) {
 336                 size_t nn_size;
 337                 char *nn;
 338
 339                 nn_size = PAGE_ALIGN(l+1);
 340                 nn = mmap(NULL, nn_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 341                 if (nn == MAP_FAILED) {
 342                         log_debug_errno(errno, "mmap() failed: %m");
 343                         goto use_saved_argv;
 344                 }
 345
 346                 strncpy(nn, name, nn_size);
 347
 348                 /* Now, let's tell the kernel about this new memory */
 349                 if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) {
 350                         log_debug_errno(errno, "PR_SET_MM_ARG_START failed, proceeding without: %m");
 351                         (void) munmap(nn, nn_size);
 352                         goto use_saved_argv;
 353                 }
 354
 355                 /* And update the end pointer to the new end, too. If this fails, we don't really know what to do, it's
 356                  * pretty unlikely that we can rollback, hence we'll just accept the failure, and continue. */
 357                 if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0)
 358                         log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m");
 359
 360                 if (mm)
 361                         (void) munmap(mm, mm_size);
 362
 363                 mm = nn;
 364                 mm_size = nn_size;
 365         } else {
 366                 strncpy(mm, name, mm_size);
 367
 368                 /* Update the end pointer, continuing regardless of any failure. */
 369                 if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) mm + l + 1, 0, 0) < 0)
 370                         log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m");
 371         }
 372
 373 use_saved_argv:
 374         /* Fourth step: in all cases we'll also update the original argv[], so that our own code gets it right too if
 375          * it still looks here */
 376
 377         if (saved_argc > 0) {
 378                 int i;
 379
 380                 if (saved_argv[0]) {
 381                         size_t k;
 382
 383                         k = strlen(saved_argv[0]);
 384                         strncpy(saved_argv[0], name, k);
 385                         if (l > k)
 386                                 truncated = true;
 387                 }
 388
 389                 for (i = 1; i < saved_argc; i++) {
 390                         if (!saved_argv[i])
 391                                 break;
 392
 393                         memzero(saved_argv[i], strlen(saved_argv[i]));
 394                 }
 395         }
 396
 397         return !truncated;
 398 }
 399
 400 int is_kernel_thread(pid_t pid) {
 401         const char *p;
 402         size_t count;
 403         char c;
 404         bool eof;
 405         FILE *f;
 406
 407         if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
 408                 return 0;
 409
 410         assert(pid > 1);
 411
 412         p = procfs_file_alloca(pid, "cmdline");
 413         f = fopen(p, "re");
 414         if (!f) {
 415                 if (errno == ENOENT)
 416                         return -ESRCH;
 417                 return -errno;
 418         }
 419
 420         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
 421
 422         count = fread(&c, 1, 1, f);
 423         eof = feof(f);
 424         fclose(f);
 425
 426         /* Kernel threads have an empty cmdline */
 427
 428         if (count <= 0)
 429                 return eof ? 1 : -errno;
 430
 431         return 0;
 432 }
 433
 434 int get_process_capeff(pid_t pid, char **capeff) {
 435         const char *p;
 436         int r;
 437
 438         assert(capeff);
 439         assert(pid >= 0);
 440
 441         p = procfs_file_alloca(pid, "status");
 442
 443         r = get_proc_field(p, "CapEff", WHITESPACE, capeff);
 444         if (r == -ENOENT)
 445                 return -ESRCH;
 446
 447         return r;
 448 }
 449
 450 static int get_process_link_contents(const char *proc_file, char **name) {
 451         int r;
 452
 453         assert(proc_file);
 454         assert(name);
 455
 456         r = readlink_malloc(proc_file, name);
 457         if (r == -ENOENT)
 458                 return -ESRCH;
 459         if (r < 0)
 460                 return r;
 461
 462         return 0;
 463 }
 464
 465 int get_process_exe(pid_t pid, char **name) {
 466         const char *p;
 467         char *d;
 468         int r;
 469
 470         assert(pid >= 0);
 471
 472         p = procfs_file_alloca(pid, "exe");
 473         r = get_process_link_contents(p, name);
 474         if (r < 0)
 475                 return r;
 476
 477         d = endswith(*name, " (deleted)");
 478         if (d)
 479                 *d = '\0';
 480
 481         return 0;
 482 }
 483
 484 static int get_process_id(pid_t pid, const char *field, uid_t *uid) {
 485         _cleanup_fclose_ FILE *f = NULL;
 486         char line[LINE_MAX];
 487         const char *p;
 488
 489         assert(field);
 490         assert(uid);
 491
 492         if (pid < 0)
 493                 return -EINVAL;
 494
 495         p = procfs_file_alloca(pid, "status");
 496         f = fopen(p, "re");
 497         if (!f) {
 498                 if (errno == ENOENT)
 499                         return -ESRCH;
 500                 return -errno;
 501         }
 502
 503         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
 504
 505         FOREACH_LINE(line, f, return -errno) {
 506                 char *l;
 507
 508                 l = strstrip(line);
 509
 510                 if (startswith(l, field)) {
 511                         l += strlen(field);
 512                         l += strspn(l, WHITESPACE);
 513
 514                         l[strcspn(l, WHITESPACE)] = 0;
 515
 516                         return parse_uid(l, uid);
 517                 }
 518         }
 519
 520         return -EIO;
 521 }
 522
 523 int get_process_uid(pid_t pid, uid_t *uid) {
 524
 525         if (pid == 0 || pid == getpid_cached()) {
 526                 *uid = getuid();
 527                 return 0;
 528         }
 529
 530         return get_process_id(pid, "Uid:", uid);
 531 }
 532
 533 int get_process_gid(pid_t pid, gid_t *gid) {
 534
 535         if (pid == 0 || pid == getpid_cached()) {
 536                 *gid = getgid();
 537                 return 0;
 538         }
 539
 540         assert_cc(sizeof(uid_t) == sizeof(gid_t));
 541         return get_process_id(pid, "Gid:", gid);
 542 }
 543
 544 int get_process_cwd(pid_t pid, char **cwd) {
 545         const char *p;
 546
 547         assert(pid >= 0);
 548
 549         p = procfs_file_alloca(pid, "cwd");
 550
 551         return get_process_link_contents(p, cwd);
 552 }
 553
 554 int get_process_root(pid_t pid, char **root) {
 555         const char *p;
 556
 557         assert(pid >= 0);
 558
 559         p = procfs_file_alloca(pid, "root");
 560
 561         return get_process_link_contents(p, root);
 562 }
 563
 564 int get_process_environ(pid_t pid, char **env) {
 565         _cleanup_fclose_ FILE *f = NULL;
 566         _cleanup_free_ char *outcome = NULL;
 567         int c;
 568         const char *p;
 569         size_t allocated = 0, sz = 0;
 570
 571         assert(pid >= 0);
 572         assert(env);
 573
 574         p = procfs_file_alloca(pid, "environ");
 575
 576         f = fopen(p, "re");
 577         if (!f) {
 578                 if (errno == ENOENT)
 579                         return -ESRCH;
 580                 return -errno;
 581         }
 582
 583         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
 584
 585         while ((c = fgetc(f)) != EOF) {
 586                 if (!GREEDY_REALLOC(outcome, allocated, sz + 5))
 587                         return -ENOMEM;
 588
 589                 if (c == '\0')
 590                         outcome[sz++] = '\n';
 591                 else
 592                         sz += cescape_char(c, outcome + sz);
 593         }
 594
 595         if (!outcome) {
 596                 outcome = strdup("");
 597                 if (!outcome)
 598                         return -ENOMEM;
 599         } else
 600                 outcome[sz] = '\0';
 601
 602         *env = outcome;
 603         outcome = NULL;
 604
 605         return 0;
 606 }
 607
 608 int get_process_ppid(pid_t pid, pid_t *_ppid) {
 609         int r;
 610         _cleanup_free_ char *line = NULL;
 611         long unsigned ppid;
 612         const char *p;
 613
 614         assert(pid >= 0);
 615         assert(_ppid);
 616
 617         if (pid == 0 || pid == getpid_cached()) {
 618                 *_ppid = getppid();
 619                 return 0;
 620         }
 621
 622         p = procfs_file_alloca(pid, "stat");
 623         r = read_one_line_file(p, &line);
 624         if (r == -ENOENT)
 625                 return -ESRCH;
 626         if (r < 0)
 627                 return r;
 628
 629         /* Let's skip the pid and comm fields. The latter is enclosed
 630          * in () but does not escape any () in its value, so let's
 631          * skip over it manually */
 632
 633         p = strrchr(line, ')');
 634         if (!p)
 635                 return -EIO;
 636
 637         p++;
 638
 639         if (sscanf(p, " "
 640                    "%*c "  /* state */
 641                    "%lu ", /* ppid */
 642                    &ppid) != 1)
 643                 return -EIO;
 644
 645         if ((long unsigned) (pid_t) ppid != ppid)
 646                 return -ERANGE;
 647
 648         *_ppid = (pid_t) ppid;
 649
 650         return 0;
 651 }
 652
 653 int wait_for_terminate(pid_t pid, siginfo_t *status) {
 654         siginfo_t dummy;
 655
 656         assert(pid >= 1);
 657
 658         if (!status)
 659                 status = &dummy;
 660
 661         for (;;) {
 662                 zero(*status);
 663
 664                 if (waitid(P_PID, pid, status, WEXITED) < 0) {
 665
 666                         if (errno == EINTR)
 667                                 continue;
 668
 669                         return negative_errno();
 670                 }
 671
 672                 return 0;
 673         }
 674 }
 675
 676 /*
 677  * Return values:
 678  * < 0 : wait_for_terminate() failed to get the state of the
 679  *       process, the process was terminated by a signal, or
 680  *       failed for an unknown reason.
 681  * >=0 : The process terminated normally, and its exit code is
 682  *       returned.
 683  *
 684  * That is, success is indicated by a return value of zero, and an
 685  * error is indicated by a non-zero value.
 686  *
 687  * A warning is emitted if the process terminates abnormally,
 688  * and also if it returns non-zero unless check_exit_code is true.
 689  */
 690 int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) {
 691         _cleanup_free_ char *buffer = NULL;
 692         siginfo_t status;
 693         int r, prio;
 694
 695         assert(pid > 1);
 696
 697         if (!name) {
 698                 r = get_process_comm(pid, &buffer);
 699                 if (r < 0)
 700                         log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pid);
 701                 else
 702                         name = buffer;
 703         }
 704
 705         prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
 706
 707         r = wait_for_terminate(pid, &status);
 708         if (r < 0)
 709                 return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name));
 710
 711         if (status.si_code == CLD_EXITED) {
 712                 if (status.si_status != EXIT_SUCCESS)
 713                         log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
 714                                  "%s failed with exit status %i.", strna(name), status.si_status);
 715                 else
 716                         log_debug("%s succeeded.", name);
 717
 718                 return status.si_status;
 719
 720         } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) {
 721
 722                 log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status));
 723                 return -EPROTO;
 724         }
 725
 726         log_full(prio, "%s failed due to unknown reason.", strna(name));
 727         return -EPROTO;
 728 }
 729
 730 /*
 731  * Return values:
 732  * < 0 : wait_for_terminate_with_timeout() failed to get the state of the
 733  *       process, the process timed out, the process was terminated by a
 734  *       signal, or failed for an unknown reason.
 735  * >=0 : The process terminated normally with no failures.
 736  *
 737  * Success is indicated by a return value of zero, a timeout is indicated
 738  * by ETIMEDOUT, and all other child failure states are indicated by error
 739  * is indicated by a non-zero value.
 740  */
 741 int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
 742         sigset_t mask;
 743         int r;
 744         usec_t until;
 745
 746         assert_se(sigemptyset(&mask) == 0);
 747         assert_se(sigaddset(&mask, SIGCHLD) == 0);
 748
 749         /* Drop into a sigtimewait-based timeout. Waiting for the
 750          * pid to exit. */
 751         until = now(CLOCK_MONOTONIC) + timeout;
 752         for (;;) {
 753                 usec_t n;
 754                 siginfo_t status = {};
 755                 struct timespec ts;
 756
 757                 n = now(CLOCK_MONOTONIC);
 758                 if (n >= until)
 759                         break;
 760
 761                 r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0;
 762                 /* Assuming we woke due to the child exiting. */
 763                 if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
 764                         if (status.si_pid == pid) {
 765                                 /* This is the correct child.*/
 766                                 if (status.si_code == CLD_EXITED)
 767                                         return (status.si_status == 0) ? 0 : -EPROTO;
 768                                 else
 769                                         return -EPROTO;
 770                         }
 771                 }
 772                 /* Not the child, check for errors and proceed appropriately */
 773                 if (r < 0) {
 774                         switch (r) {
 775                         case -EAGAIN:
 776                                 /* Timed out, child is likely hung. */
 777                                 return -ETIMEDOUT;
 778                         case -EINTR:
 779                                 /* Received a different signal and should retry */
 780                                 continue;
 781                         default:
 782                                 /* Return any unexpected errors */
 783                                 return r;
 784                         }
 785                 }
 786         }
 787
 788         return -EPROTO;
 789 }
 790
 791 void sigkill_wait(pid_t pid) {
 792         assert(pid > 1);
 793
 794         if (kill(pid, SIGKILL) > 0)
 795                 (void) wait_for_terminate(pid, NULL);
 796 }
 797
 798 void sigkill_waitp(pid_t *pid) {
 799         PROTECT_ERRNO;
 800
 801         if (!pid)
 802                 return;
 803         if (*pid <= 1)
 804                 return;
 805
 806         sigkill_wait(*pid);
 807 }
 808
 809 int kill_and_sigcont(pid_t pid, int sig) {
 810         int r;
 811
 812         r = kill(pid, sig) < 0 ? -errno : 0;
 813
 814         /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
 815          * affected by a process being suspended anyway. */
 816         if (r >= 0 && !IN_SET(sig, SIGCONT, SIGKILL))
 817                 (void) kill(pid, SIGCONT);
 818
 819         return r;
 820 }
 821
 822 int getenv_for_pid(pid_t pid, const char *field, char **_value) {
 823         _cleanup_fclose_ FILE *f = NULL;
 824         char *value = NULL;
 825         int r;
 826         bool done = false;
 827         size_t l;
 828         const char *path;
 829
 830         assert(pid >= 0);
 831         assert(field);
 832         assert(_value);
 833
 834         path = procfs_file_alloca(pid, "environ");
 835
 836         f = fopen(path, "re");
 837         if (!f) {
 838                 if (errno == ENOENT)
 839                         return -ESRCH;
 840                 return -errno;
 841         }
 842
 843         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
 844
 845         l = strlen(field);
 846         r = 0;
 847
 848         do {
 849                 char line[LINE_MAX];
 850                 unsigned i;
 851
 852                 for (i = 0; i < sizeof(line)-1; i++) {
 853                         int c;
 854
 855                         c = getc(f);
 856                         if (_unlikely_(c == EOF)) {
 857                                 done = true;
 858                                 break;
 859                         } else if (c == 0)
 860                                 break;
 861
 862                         line[i] = c;
 863                 }
 864                 line[i] = 0;
 865
 866                 if (strneq(line, field, l) && line[l] == '=') {
 867                         value = strdup(line + l + 1);
 868                         if (!value)
 869                                 return -ENOMEM;
 870
 871                         r = 1;
 872                         break;
 873                 }
 874
 875         } while (!done);
 876
 877         *_value = value;
 878         return r;
 879 }
 880
 881 bool pid_is_unwaited(pid_t pid) {
 882         /* Checks whether a PID is still valid at all, including a zombie */
 883
 884         if (pid < 0)
 885                 return false;
 886
 887         if (pid <= 1) /* If we or PID 1 would be dead and have been waited for, this code would not be running */
 888                 return true;
 889
 890         if (pid == getpid_cached())
 891                 return true;
 892
 893         if (kill(pid, 0) >= 0)
 894                 return true;
 895
 896         return errno != ESRCH;
 897 }
 898
 899 bool pid_is_alive(pid_t pid) {
 900         int r;
 901
 902         /* Checks whether a PID is still valid and not a zombie */
 903
 904         if (pid < 0)
 905                 return false;
 906
 907         if (pid <= 1) /* If we or PID 1 would be a zombie, this code would not be running */
 908                 return true;
 909
 910         if (pid == getpid_cached())
 911                 return true;
 912
 913         r = get_process_state(pid);
 914         if (IN_SET(r, -ESRCH, 'Z'))
 915                 return false;
 916
 917         return true;
 918 }
 919
 920 int pid_from_same_root_fs(pid_t pid) {
 921         const char *root;
 922
 923         if (pid < 0)
 924                 return false;
 925
 926         if (pid == 0 || pid == getpid_cached())
 927                 return true;
 928
 929         root = procfs_file_alloca(pid, "root");
 930
 931         return files_same(root, "/proc/1/root", 0);
 932 }
 933
 934 bool is_main_thread(void) {
 935         static thread_local int cached = 0;
 936
 937         if (_unlikely_(cached == 0))
 938                 cached = getpid_cached() == gettid() ? 1 : -1;
 939
 940         return cached > 0;
 941 }
 942
 943 noreturn void freeze(void) {
 944
 945         log_close();
 946
 947         /* Make sure nobody waits for us on a socket anymore */
 948         close_all_fds(NULL, 0);
 949
 950         sync();
 951
 952         /* Let's not freeze right away, but keep reaping zombies. */
 953         for (;;) {
 954                 int r;
 955                 siginfo_t si = {};
 956
 957                 r = waitid(P_ALL, 0, &si, WEXITED);
 958                 if (r < 0 && errno != EINTR)
 959                         break;
 960         }
 961
 962         /* waitid() failed with an unexpected error, things are really borked. Freeze now! */
 963         for (;;)
 964                 pause();
 965 }
 966
 967 bool oom_score_adjust_is_valid(int oa) {
 968         return oa >= OOM_SCORE_ADJ_MIN && oa <= OOM_SCORE_ADJ_MAX;
 969 }
 970
 971 unsigned long personality_from_string(const char *p) {
 972         int architecture;
 973
 974         if (!p)
 975                 return PERSONALITY_INVALID;
 976
 977         /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
 978          * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
 979          * the same register size. */
 980
 981         architecture = architecture_from_string(p);
 982         if (architecture < 0)
 983                 return PERSONALITY_INVALID;
 984
 985         if (architecture == native_architecture())
 986                 return PER_LINUX;
 987 #ifdef SECONDARY_ARCHITECTURE
 988         if (architecture == SECONDARY_ARCHITECTURE)
 989                 return PER_LINUX32;
 990 #endif
 991
 992         return PERSONALITY_INVALID;
 993 }
 994
 995 const char* personality_to_string(unsigned long p) {
 996         int architecture = _ARCHITECTURE_INVALID;
 997
 998         if (p == PER_LINUX)
 999                 architecture = native_architecture();
1000 #ifdef SECONDARY_ARCHITECTURE
1001         else if (p == PER_LINUX32)
1002                 architecture = SECONDARY_ARCHITECTURE;
1003 #endif
1004
1005         if (architecture < 0)
1006                 return NULL;
1007
1008         return architecture_to_string(architecture);
1009 }
1010
/* Wrapper around personality() that papers over glibc's inconsistent error reporting: depending on the case,
 * a failure is signalled either through errno or through a negative, errno-like return value. We prefer errno
 * if it was set and fall back to the return value otherwise. On failure, the return value and errno always
 * end up describing the same error.
 *
 * Returns the result of personality() on success, or a negative errno-style value on failure.
 *
 * See https://github.com/systemd/systemd/issues/6737 */
int safe_personality(unsigned long p) {
        int result;

        /* Clear errno first, so that we can tell afterwards whether the call touched it */
        errno = 0;
        result = personality(p);

        if (result >= 0)
                return result;

        if (errno == 0) {
                /* The error was only reported in-band, mirror it into errno */
                errno = -result;
                return result;
        }

        return -errno;
}
1032
1033 int opinionated_personality(unsigned long *ret) {
1034         int current;
1035
1036         /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1037          * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1038          * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1039
1040         current = safe_personality(PERSONALITY_INVALID);
1041         if (current < 0)
1042                 return current;
1043
1044         if (((unsigned long) current & 0xffff) == PER_LINUX32)
1045                 *ret = PER_LINUX32;
1046         else
1047                 *ret = PER_LINUX;
1048
1049         return 0;
1050 }
1051
/* When running as PID 1 under valgrind, forks off a helper child that exits right away, and waits for it.
 * Presumably this makes valgrind print its exit-time summary for a copy of PID 1 (TODO confirm rationale).
 * This is a no-op if valgrind support was not compiled in. */
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        pid_t helper;

        if (getpid_cached() != 1 || !RUNNING_ON_VALGRIND)
                return;

        helper = raw_clone(SIGCHLD);
        if (helper < 0) {
                log_emergency_errno(errno, "Failed to fork off valgrind helper: %m");
                return;
        }

        if (helper == 0)
                /* Child: terminate immediately through the regular exit() path */
                exit(EXIT_SUCCESS);

        log_info("Spawned valgrind helper as PID "PID_FMT".", helper);
        (void) wait_for_terminate(helper, NULL);
#endif
}
1068
/* Three-way comparison of two pid_t values passed by pointer, with a signature that fits qsort(). Returns -1,
 * 0 or 1 if the first PID is smaller than, equal to, or larger than the second one. */
int pid_compare_func(const void *a, const void *b) {
        const pid_t *lhs = a, *rhs = b;

        /* Each comparison yields 0 or 1, hence the difference is exactly -1, 0 or 1 */
        return (*lhs > *rhs) - (*lhs < *rhs);
}
1080
/* Parses an I/O priority from a string. On success the value is stored in *ret and 0 is returned. Returns
 * the error from safe_atoi() if the string is not an integer, and -EINVAL if the parsed number is rejected by
 * ioprio_priority_is_valid(). *ret is left untouched on failure. */
int ioprio_parse_priority(const char *s, int *ret) {
        int parsed, r;

        assert(s);
        assert(ret);

        r = safe_atoi(s, &parsed);
        if (r < 0)
                return r;

        if (ioprio_priority_is_valid(parsed)) {
                *ret = parsed;
                return 0;
        }

        return -EINVAL;
}
1097
/* State of the PID cache used by getpid_cached(). The variable holds one of:
 *
 *     CACHED_PID_UNSET [0]  → nothing has been cached yet
 *     CACHED_PID_BUSY [-1]  → a thread is in the middle of filling the cache
 *     anything else         → the PID that was cached
 */

#define CACHED_PID_UNSET ((pid_t) 0)
#define CACHED_PID_BUSY ((pid_t) -1)

static pid_t cached_pid = CACHED_PID_UNSET;

/* Invalidates the PID cache. This runs in the child right after a fork(), which is the first point in time
 * where the PID differs from the cached one. */
void reset_cached_pid(void) {
        cached_pid = CACHED_PID_UNSET;
}
1114
/* glibc does not declare __register_atfork() and __dso_handle in its public headers, hence we declare them
 * here ourselves. __register_atfork() is mostly equivalent to pthread_atfork(), but lives in glibc proper, so
 * that using it does not require linking against libpthread. */
extern int __register_atfork(
                void (*prepare) (void),
                void (*parent) (void),
                void (*child) (void),
                void *__dso_handle);
extern void *__dso_handle __attribute__ ((__weak__));
1120
1121 pid_t getpid_cached(void) {
1122         pid_t current_value;
1123
1124         /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1125          * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1126          * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1127          * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1128          *
1129          * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1130          * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1131          */
1132
1133         current_value = __sync_val_compare_and_swap(&cached_pid, CACHED_PID_UNSET, CACHED_PID_BUSY);
1134
1135         switch (current_value) {
1136
1137         case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
1138                 pid_t new_pid;
1139
1140                 new_pid = raw_getpid();
1141
1142                 if (__register_atfork(NULL, NULL, reset_cached_pid, __dso_handle) != 0) {
1143                         /* OOM? Let's try again later */
1144                         cached_pid = CACHED_PID_UNSET;
1145                         return new_pid;
1146                 }
1147
1148                 cached_pid = new_pid;
1149                 return new_pid;
1150         }
1151
1152         case CACHED_PID_BUSY: /* Somebody else is currently initializing */
1153                 return raw_getpid();
1154
1155         default: /* Properly initialized */
1156                 return current_value;
1157         }
1158 }
1159
/* Verifies that the effective UID is 0. Returns 0 if so; otherwise logs an error and returns -EPERM. */
int must_be_root(void) {

        if (geteuid() != 0) {
                log_error("Need to be root.");
                return -EPERM;
        }

        return 0;
}
1168
1169 int safe_fork_full(
1170                 const char *name,
1171                 const int except_fds[],
1172                 size_t n_except_fds,
1173                 ForkFlags flags,
1174                 pid_t *ret_pid) {
1175
1176         pid_t original_pid, pid;
1177         sigset_t saved_ss, ss;
1178         bool block_signals = false;
1179         int prio, r;
1180
1181         /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
1182          * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
1183
1184         prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;
1185
1186         original_pid = getpid_cached();
1187
1188         if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG)) {
1189
1190                 /* We temporarily block all signals, so that the new child has them blocked initially. This way, we can
1191                  * be sure that SIGTERMs are not lost we might send to the child. */
1192
1193                 if (sigfillset(&ss) < 0)
1194                         return log_full_errno(prio, errno, "Failed to reset signal set: %m");
1195
1196                 block_signals = true;
1197
1198         } else if (flags & FORK_WAIT) {
1199
1200                 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1201
1202                 if (sigemptyset(&ss) < 0)
1203                         return log_full_errno(prio, errno, "Failed to clear signal set: %m");
1204
1205                 if (sigaddset(&ss, SIGCHLD) < 0)
1206                         return log_full_errno(prio, errno, "Failed to add SIGCHLD to signal set: %m");
1207
1208                 block_signals = true;
1209         }
1210
1211         if (block_signals)
1212                 if (sigprocmask(SIG_SETMASK, &ss, &saved_ss) < 0)
1213                         return log_full_errno(prio, errno, "Failed to set signal mask: %m");
1214
1215         if (flags & FORK_NEW_MOUNTNS)
1216                 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
1217         else
1218                 pid = fork();
1219         if (pid < 0) {
1220                 r = -errno;
1221
1222                 if (block_signals) /* undo what we did above */
1223                         (void) sigprocmask(SIG_SETMASK, &saved_ss, NULL);
1224
1225                 return log_full_errno(prio, r, "Failed to fork: %m");
1226         }
1227         if (pid > 0) {
1228                 /* We are in the parent process */
1229
1230                 log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
1231
1232                 if (flags & FORK_WAIT) {
1233                         r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? WAIT_LOG : 0));
1234                         if (r < 0)
1235                                 return r;
1236                         if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
1237                                 return -EPROTO;
1238                 }
1239
1240                 if (block_signals) /* undo what we did above */
1241                         (void) sigprocmask(SIG_SETMASK, &saved_ss, NULL);
1242
1243                 if (ret_pid)
1244                         *ret_pid = pid;
1245
1246                 return 1;
1247         }
1248
1249         /* We are in the child process */
1250
1251         if (flags & FORK_REOPEN_LOG) {
1252                 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1253                 log_close();
1254                 log_set_open_when_needed(true);
1255         }
1256
1257         if (name) {
1258                 r = rename_process(name);
1259                 if (r < 0)
1260                         log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
1261                                        r, "Failed to rename process, ignoring: %m");
1262         }
1263
1264         if (flags & FORK_DEATHSIG)
1265                 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) {
1266                         log_full_errno(prio, errno, "Failed to set death signal: %m");
1267                         _exit(EXIT_FAILURE);
1268                 }
1269
1270         if (flags & FORK_RESET_SIGNALS) {
1271                 r = reset_all_signal_handlers();
1272                 if (r < 0) {
1273                         log_full_errno(prio, r, "Failed to reset signal handlers: %m");
1274                         _exit(EXIT_FAILURE);
1275                 }
1276
1277                 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1278                 r = reset_signal_mask();
1279                 if (r < 0) {
1280                         log_full_errno(prio, r, "Failed to reset signal mask: %m");
1281                         _exit(EXIT_FAILURE);
1282                 }
1283         } else if (block_signals) { /* undo what we did above */
1284                 if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
1285                         log_full_errno(prio, errno, "Failed to restore signal mask: %m");
1286                         _exit(EXIT_FAILURE);
1287                 }
1288         }
1289
1290         if (flags & FORK_DEATHSIG) {
1291                 pid_t ppid;
1292                 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1293                  * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1294
1295                 ppid = getppid();
1296                 if (ppid == 0)
1297                         /* Parent is in a differn't PID namespace. */;
1298                 else if (ppid != original_pid) {
1299                         log_debug("Parent died early, raising SIGTERM.");
1300                         (void) raise(SIGTERM);
1301                         _exit(EXIT_FAILURE);
1302                 }
1303         }
1304
1305         if (flags & FORK_CLOSE_ALL_FDS) {
1306                 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1307                 log_close();
1308
1309                 r = close_all_fds(except_fds, n_except_fds);
1310                 if (r < 0) {
1311                         log_full_errno(prio, r, "Failed to close all file descriptors: %m");
1312                         _exit(EXIT_FAILURE);
1313                 }
1314         }
1315
1316         /* When we were asked to reopen the logs, do so again now */
1317         if (flags & FORK_REOPEN_LOG) {
1318                 log_open();
1319                 log_set_open_when_needed(false);
1320         }
1321
1322         if (flags & FORK_NULL_STDIO) {
1323                 r = make_null_stdio();
1324                 if (r < 0) {
1325                         log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
1326                         _exit(EXIT_FAILURE);
1327                 }
1328         }
1329
1330         if (ret_pid)
1331                 *ret_pid = getpid_cached();
1332
1333         return 0;
1334 }
1335
1336 int fork_agent(const char *name, const int except[], unsigned n_except, pid_t *ret_pid, const char *path, ...) {
1337         bool stdout_is_tty, stderr_is_tty;
1338         unsigned n, i;
1339         va_list ap;
1340         char **l;
1341         int r;
1342
1343         assert(path);
1344
1345         /* Spawns a temporary TTY agent, making sure it goes away when we go away */
1346
1347         r = safe_fork_full(name, except, n_except, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_CLOSE_ALL_FDS, ret_pid);
1348         if (r < 0)
1349                 return r;
1350         if (r > 0)
1351                 return 0;
1352
1353         /* In the child: */
1354
1355         stdout_is_tty = isatty(STDOUT_FILENO);
1356         stderr_is_tty = isatty(STDERR_FILENO);
1357
1358         if (!stdout_is_tty || !stderr_is_tty) {
1359                 int fd;
1360
1361                 /* Detach from stdout/stderr. and reopen
1362                  * /dev/tty for them. This is important to
1363                  * ensure that when systemctl is started via
1364                  * popen() or a similar call that expects to
1365                  * read EOF we actually do generate EOF and
1366                  * not delay this indefinitely by because we
1367                  * keep an unused copy of stdin around. */
1368                 fd = open("/dev/tty", O_WRONLY);
1369                 if (fd < 0) {
1370                         log_error_errno(errno, "Failed to open /dev/tty: %m");
1371                         _exit(EXIT_FAILURE);
1372                 }
1373
1374                 if (!stdout_is_tty && dup2(fd, STDOUT_FILENO) < 0) {
1375                         log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
1376                         _exit(EXIT_FAILURE);
1377                 }
1378
1379                 if (!stderr_is_tty && dup2(fd, STDERR_FILENO) < 0) {
1380                         log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
1381                         _exit(EXIT_FAILURE);
1382                 }
1383
1384                 if (fd > STDERR_FILENO)
1385                         close(fd);
1386         }
1387
1388         /* Count arguments */
1389         va_start(ap, path);
1390         for (n = 0; va_arg(ap, char*); n++)
1391                 ;
1392         va_end(ap);
1393
1394         /* Allocate strv */
1395         l = alloca(sizeof(char *) * (n + 1));
1396
1397         /* Fill in arguments */
1398         va_start(ap, path);
1399         for (i = 0; i <= n; i++)
1400                 l[i] = va_arg(ap, char*);
1401         va_end(ap);
1402
1403         execv(path, l);
1404         _exit(EXIT_FAILURE);
1405 }
1406
1407 static const char *const ioprio_class_table[] = {
1408         [IOPRIO_CLASS_NONE] = "none",
1409         [IOPRIO_CLASS_RT] = "realtime",
1410         [IOPRIO_CLASS_BE] = "best-effort",
1411         [IOPRIO_CLASS_IDLE] = "idle"
1412 };
1413
1414 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(ioprio_class, int, INT_MAX);
1415
1416 static const char *const sigchld_code_table[] = {
1417         [CLD_EXITED] = "exited",
1418         [CLD_KILLED] = "killed",
1419         [CLD_DUMPED] = "dumped",
1420         [CLD_TRAPPED] = "trapped",
1421         [CLD_STOPPED] = "stopped",
1422         [CLD_CONTINUED] = "continued",
1423 };
1424
1425 DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
1426
1427 static const char* const sched_policy_table[] = {
1428         [SCHED_OTHER] = "other",
1429         [SCHED_BATCH] = "batch",
1430         [SCHED_IDLE] = "idle",
1431         [SCHED_FIFO] = "fifo",
1432         [SCHED_RR] = "rr"
1433 };
1434
1435 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);