src/basic/process-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <ctype.h>
   4 #include <errno.h>
   5 #include <limits.h>
   6 #include <linux/oom.h>
   7 #include <sched.h>
   8 #include <signal.h>
   9 #include <stdbool.h>
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <string.h>
  13 #include <sys/mman.h>
  14 #include <sys/mount.h>
  15 #include <sys/personality.h>
  16 #include <sys/prctl.h>
  17 #include <sys/types.h>
  18 #include <sys/wait.h>
  19 #include <syslog.h>
  20 #include <unistd.h>
  21 #if HAVE_VALGRIND_VALGRIND_H
  22 #include <valgrind/valgrind.h>
  23 #endif
  24
  25 #include "alloc-util.h"
  26 #include "architecture.h"
  27 #include "escape.h"
  28 #include "env-util.h"
  29 #include "fd-util.h"
  30 #include "fileio.h"
  31 #include "fs-util.h"
  32 #include "ioprio.h"
  33 #include "locale-util.h"
  34 #include "log.h"
  35 #include "macro.h"
  36 #include "memory-util.h"
  37 #include "missing.h"
  38 #include "namespace-util.h"
  39 #include "process-util.h"
  40 #include "raw-clone.h"
  41 #include "rlimit-util.h"
  42 #include "signal-util.h"
  43 #include "stat-util.h"
  44 #include "string-table.h"
  45 #include "string-util.h"
  46 #include "terminal-util.h"
  47 #include "user-util.h"
  48 #include "utf8.h"
  49
  50 /* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
  51  * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
  52  */
  53 #define COMM_MAX_LEN 128
  54
  55 static int get_process_state(pid_t pid) {
  56         const char *p;
  57         char state;
  58         int r;
  59         _cleanup_free_ char *line = NULL;
  60
  61         assert(pid >= 0);
  62
  63         p = procfs_file_alloca(pid, "stat");
  64
  65         r = read_one_line_file(p, &line);
  66         if (r == -ENOENT)
  67                 return -ESRCH;
  68         if (r < 0)
  69                 return r;
  70
  71         p = strrchr(line, ')');
  72         if (!p)
  73                 return -EIO;
  74
  75         p++;
  76
  77         if (sscanf(p, " %c", &state) != 1)
  78                 return -EIO;
  79
  80         return (unsigned char) state;
  81 }
  82
  83 int get_process_comm(pid_t pid, char **ret) {
  84         _cleanup_free_ char *escaped = NULL, *comm = NULL;
  85         const char *p;
  86         int r;
  87
  88         assert(ret);
  89         assert(pid >= 0);
  90
  91         escaped = new(char, COMM_MAX_LEN);
  92         if (!escaped)
  93                 return -ENOMEM;
  94
  95         p = procfs_file_alloca(pid, "comm");
  96
  97         r = read_one_line_file(p, &comm);
  98         if (r == -ENOENT)
  99                 return -ESRCH;
 100         if (r < 0)
 101                 return r;
 102
 103         /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
 104         cellescape(escaped, COMM_MAX_LEN, comm);
 105
 106         *ret = TAKE_PTR(escaped);
 107         return 0;
 108 }
 109
 110 int get_process_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **line) {
 111         _cleanup_fclose_ FILE *f = NULL;
 112         _cleanup_free_ char *t = NULL, *ans = NULL;
 113         const char *p;
 114         int r;
 115         size_t k;
 116
 117         /* This is supposed to be a safety guard against runaway command lines. */
 118         size_t max_length = sc_arg_max();
 119
 120         assert(line);
 121         assert(pid >= 0);
 122
 123         /* Retrieves a process' command line. Replaces non-utf8 bytes by replacement character (�). If
 124          * max_columns is != -1 will return a string of the specified console width at most, abbreviated with
 125          * an ellipsis. If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command
 126          * line set (the case for kernel threads), or has a command line that resolves to the empty string
 127          * will return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of
 128          * input data.
 129          *
 130          * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
 131          * comm_fallback is false). Returns 0 and sets *line otherwise. */
 132
 133         p = procfs_file_alloca(pid, "cmdline");
 134         r = fopen_unlocked(p, "re", &f);
 135         if (r == -ENOENT)
 136                 return -ESRCH;
 137         if (r < 0)
 138                 return r;
 139
 140         /* We assume that each four-byte character uses one or two columns. If we ever check for combining
 141          * characters, this assumption will need to be adjusted. */
 142         if ((size_t) 4 * max_columns + 1 < max_columns)
 143                 max_length = MIN(max_length, (size_t) 4 * max_columns + 1);
 144
 145         t = new(char, max_length);
 146         if (!t)
 147                 return -ENOMEM;
 148
 149         k = fread(t, 1, max_length, f);
 150         if (k > 0) {
 151                 /* Arguments are separated by NULs. Let's replace those with spaces. */
 152                 for (size_t i = 0; i < k - 1; i++)
 153                         if (t[i] == '\0')
 154                                 t[i] = ' ';
 155
 156                 t[k] = '\0'; /* Normally, t[k] is already NUL, so this is just a guard in case of short read */
 157         } else {
 158                 /* We only treat getting nothing as an error. We *could* also get an error after reading some
 159                  * data, but we ignore that case, as such an error is rather unlikely and we prefer to get
 160                  * some data rather than none. */
 161                 if (ferror(f))
 162                         return -errno;
 163
 164                 if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK))
 165                         return -ENOENT;
 166
 167                 /* Kernel threads have no argv[] */
 168                 _cleanup_free_ char *t2 = NULL;
 169
 170                 r = get_process_comm(pid, &t2);
 171                 if (r < 0)
 172                         return r;
 173
 174                 mfree(t);
 175                 t = strjoin("[", t2, "]");
 176                 if (!t)
 177                         return -ENOMEM;
 178         }
 179
 180         delete_trailing_chars(t, WHITESPACE);
 181
 182         bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8();
 183
 184         ans = escape_non_printable_full(t, max_columns, eight_bit);
 185         if (!ans)
 186                 return -ENOMEM;
 187
 188         (void) str_realloc(&ans);
 189         *line = TAKE_PTR(ans);
 190         return 0;
 191 }
 192
 193 int rename_process(const char name[]) {
 194         static size_t mm_size = 0;
 195         static char *mm = NULL;
 196         bool truncated = false;
 197         size_t l;
 198
 199         /* This is a like a poor man's setproctitle(). It changes the comm field, argv[0], and also the glibc's
 200          * internally used name of the process. For the first one a limit of 16 chars applies; to the second one in
 201          * many cases one of 10 (i.e. length of "/sbin/init") — however if we have CAP_SYS_RESOURCES it is unbounded;
 202          * to the third one 7 (i.e. the length of "systemd". If you pass a longer string it will likely be
 203          * truncated.
 204          *
 205          * Returns 0 if a name was set but truncated, > 0 if it was set but not truncated. */
 206
 207         if (isempty(name))
 208                 return -EINVAL; /* let's not confuse users unnecessarily with an empty name */
 209
 210         if (!is_main_thread())
 211                 return -EPERM; /* Let's not allow setting the process name from other threads than the main one, as we
 212                                 * cache things without locking, and we make assumptions that PR_SET_NAME sets the
 213                                 * process name that isn't correct on any other threads */
 214
 215         l = strlen(name);
 216
 217         /* First step, change the comm field. The main thread's comm is identical to the process comm. This means we
 218          * can use PR_SET_NAME, which sets the thread name for the calling thread. */
 219         if (prctl(PR_SET_NAME, name) < 0)
 220                 log_debug_errno(errno, "PR_SET_NAME failed: %m");
 221         if (l >= TASK_COMM_LEN) /* Linux userspace process names can be 15 chars at max */
 222                 truncated = true;
 223
 224         /* Second step, change glibc's ID of the process name. */
 225         if (program_invocation_name) {
 226                 size_t k;
 227
 228                 k = strlen(program_invocation_name);
 229                 strncpy(program_invocation_name, name, k);
 230                 if (l > k)
 231                         truncated = true;
 232         }
 233
 234         /* Third step, completely replace the argv[] array the kernel maintains for us. This requires privileges, but
 235          * has the advantage that the argv[] array is exactly what we want it to be, and not filled up with zeros at
 236          * the end. This is the best option for changing /proc/self/cmdline. */
 237
 238         /* Let's not bother with this if we don't have euid == 0. Strictly speaking we should check for the
 239          * CAP_SYS_RESOURCE capability which is independent of the euid. In our own code the capability generally is
 240          * present only for euid == 0, hence let's use this as quick bypass check, to avoid calling mmap() if
 241          * PR_SET_MM_ARG_{START,END} fails with EPERM later on anyway. After all geteuid() is dead cheap to call, but
 242          * mmap() is not. */
 243         if (geteuid() != 0)
 244                 log_debug("Skipping PR_SET_MM, as we don't have privileges.");
 245         else if (mm_size < l+1) {
 246                 size_t nn_size;
 247                 char *nn;
 248
 249                 nn_size = PAGE_ALIGN(l+1);
 250                 nn = mmap(NULL, nn_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 251                 if (nn == MAP_FAILED) {
 252                         log_debug_errno(errno, "mmap() failed: %m");
 253                         goto use_saved_argv;
 254                 }
 255
 256                 strncpy(nn, name, nn_size);
 257
 258                 /* Now, let's tell the kernel about this new memory */
 259                 if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) {
 260                         /* HACK: prctl() API is kind of dumb on this point.  The existing end address may already be
 261                          * below the desired start address, in which case the kernel may have kicked this back due
 262                          * to a range-check failure (see linux/kernel/sys.c:validate_prctl_map() to see this in
 263                          * action).  The proper solution would be to have a prctl() API that could set both start+end
 264                          * simultaneously, or at least let us query the existing address to anticipate this condition
 265                          * and respond accordingly.  For now, we can only guess at the cause of this failure and try
 266                          * a workaround--which will briefly expand the arg space to something potentially huge before
 267                          * resizing it to what we want. */
 268                         log_debug_errno(errno, "PR_SET_MM_ARG_START failed, attempting PR_SET_MM_ARG_END hack: %m");
 269
 270                         if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0) {
 271                                 log_debug_errno(errno, "PR_SET_MM_ARG_END hack failed, proceeding without: %m");
 272                                 (void) munmap(nn, nn_size);
 273                                 goto use_saved_argv;
 274                         }
 275
 276                         if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) {
 277                                 log_debug_errno(errno, "PR_SET_MM_ARG_START still failed, proceeding without: %m");
 278                                 goto use_saved_argv;
 279                         }
 280                 } else {
 281                         /* And update the end pointer to the new end, too. If this fails, we don't really know what
 282                          * to do, it's pretty unlikely that we can rollback, hence we'll just accept the failure,
 283                          * and continue. */
 284                         if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0)
 285                                 log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m");
 286                 }
 287
 288                 if (mm)
 289                         (void) munmap(mm, mm_size);
 290
 291                 mm = nn;
 292                 mm_size = nn_size;
 293         } else {
 294                 strncpy(mm, name, mm_size);
 295
 296                 /* Update the end pointer, continuing regardless of any failure. */
 297                 if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) mm + l + 1, 0, 0) < 0)
 298                         log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m");
 299         }
 300
 301 use_saved_argv:
 302         /* Fourth step: in all cases we'll also update the original argv[], so that our own code gets it right too if
 303          * it still looks here */
 304
 305         if (saved_argc > 0) {
 306                 int i;
 307
 308                 if (saved_argv[0]) {
 309                         size_t k;
 310
 311                         k = strlen(saved_argv[0]);
 312                         strncpy(saved_argv[0], name, k);
 313                         if (l > k)
 314                                 truncated = true;
 315                 }
 316
 317                 for (i = 1; i < saved_argc; i++) {
 318                         if (!saved_argv[i])
 319                                 break;
 320
 321                         memzero(saved_argv[i], strlen(saved_argv[i]));
 322                 }
 323         }
 324
 325         return !truncated;
 326 }
 327
 328 int is_kernel_thread(pid_t pid) {
 329         _cleanup_free_ char *line = NULL;
 330         unsigned long long flags;
 331         size_t l, i;
 332         const char *p;
 333         char *q;
 334         int r;
 335
 336         if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
 337                 return 0;
 338         if (!pid_is_valid(pid))
 339                 return -EINVAL;
 340
 341         p = procfs_file_alloca(pid, "stat");
 342         r = read_one_line_file(p, &line);
 343         if (r == -ENOENT)
 344                 return -ESRCH;
 345         if (r < 0)
 346                 return r;
 347
 348         /* Skip past the comm field */
 349         q = strrchr(line, ')');
 350         if (!q)
 351                 return -EINVAL;
 352         q++;
 353
 354         /* Skip 6 fields to reach the flags field */
 355         for (i = 0; i < 6; i++) {
 356                 l = strspn(q, WHITESPACE);
 357                 if (l < 1)
 358                         return -EINVAL;
 359                 q += l;
 360
 361                 l = strcspn(q, WHITESPACE);
 362                 if (l < 1)
 363                         return -EINVAL;
 364                 q += l;
 365         }
 366
 367         /* Skip preceding whitespace */
 368         l = strspn(q, WHITESPACE);
 369         if (l < 1)
 370                 return -EINVAL;
 371         q += l;
 372
 373         /* Truncate the rest */
 374         l = strcspn(q, WHITESPACE);
 375         if (l < 1)
 376                 return -EINVAL;
 377         q[l] = 0;
 378
 379         r = safe_atollu(q, &flags);
 380         if (r < 0)
 381                 return r;
 382
 383         return !!(flags & PF_KTHREAD);
 384 }
 385
 386 int get_process_capeff(pid_t pid, char **capeff) {
 387         const char *p;
 388         int r;
 389
 390         assert(capeff);
 391         assert(pid >= 0);
 392
 393         p = procfs_file_alloca(pid, "status");
 394
 395         r = get_proc_field(p, "CapEff", WHITESPACE, capeff);
 396         if (r == -ENOENT)
 397                 return -ESRCH;
 398
 399         return r;
 400 }
 401
 402 static int get_process_link_contents(const char *proc_file, char **name) {
 403         int r;
 404
 405         assert(proc_file);
 406         assert(name);
 407
 408         r = readlink_malloc(proc_file, name);
 409         if (r == -ENOENT)
 410                 return -ESRCH;
 411         if (r < 0)
 412                 return r;
 413
 414         return 0;
 415 }
 416
 417 int get_process_exe(pid_t pid, char **name) {
 418         const char *p;
 419         char *d;
 420         int r;
 421
 422         assert(pid >= 0);
 423
 424         p = procfs_file_alloca(pid, "exe");
 425         r = get_process_link_contents(p, name);
 426         if (r < 0)
 427                 return r;
 428
 429         d = endswith(*name, " (deleted)");
 430         if (d)
 431                 *d = '\0';
 432
 433         return 0;
 434 }
 435
 436 static int get_process_id(pid_t pid, const char *field, uid_t *uid) {
 437         _cleanup_fclose_ FILE *f = NULL;
 438         const char *p;
 439         int r;
 440
 441         assert(field);
 442         assert(uid);
 443
 444         if (pid < 0)
 445                 return -EINVAL;
 446
 447         p = procfs_file_alloca(pid, "status");
 448         r = fopen_unlocked(p, "re", &f);
 449         if (r == -ENOENT)
 450                 return -ESRCH;
 451         if (r < 0)
 452                 return r;
 453
 454         for (;;) {
 455                 _cleanup_free_ char *line = NULL;
 456                 char *l;
 457
 458                 r = read_line(f, LONG_LINE_MAX, &line);
 459                 if (r < 0)
 460                         return r;
 461                 if (r == 0)
 462                         break;
 463
 464                 l = strstrip(line);
 465
 466                 if (startswith(l, field)) {
 467                         l += strlen(field);
 468                         l += strspn(l, WHITESPACE);
 469
 470                         l[strcspn(l, WHITESPACE)] = 0;
 471
 472                         return parse_uid(l, uid);
 473                 }
 474         }
 475
 476         return -EIO;
 477 }
 478
 479 int get_process_uid(pid_t pid, uid_t *uid) {
 480
 481         if (pid == 0 || pid == getpid_cached()) {
 482                 *uid = getuid();
 483                 return 0;
 484         }
 485
 486         return get_process_id(pid, "Uid:", uid);
 487 }
 488
 489 int get_process_gid(pid_t pid, gid_t *gid) {
 490
 491         if (pid == 0 || pid == getpid_cached()) {
 492                 *gid = getgid();
 493                 return 0;
 494         }
 495
 496         assert_cc(sizeof(uid_t) == sizeof(gid_t));
 497         return get_process_id(pid, "Gid:", gid);
 498 }
 499
 500 int get_process_cwd(pid_t pid, char **cwd) {
 501         const char *p;
 502
 503         assert(pid >= 0);
 504
 505         p = procfs_file_alloca(pid, "cwd");
 506
 507         return get_process_link_contents(p, cwd);
 508 }
 509
 510 int get_process_root(pid_t pid, char **root) {
 511         const char *p;
 512
 513         assert(pid >= 0);
 514
 515         p = procfs_file_alloca(pid, "root");
 516
 517         return get_process_link_contents(p, root);
 518 }
 519
 520 #define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
 521
 522 int get_process_environ(pid_t pid, char **env) {
 523         _cleanup_fclose_ FILE *f = NULL;
 524         _cleanup_free_ char *outcome = NULL;
 525         size_t allocated = 0, sz = 0;
 526         const char *p;
 527         int r;
 528
 529         assert(pid >= 0);
 530         assert(env);
 531
 532         p = procfs_file_alloca(pid, "environ");
 533
 534         r = fopen_unlocked(p, "re", &f);
 535         if (r == -ENOENT)
 536                 return -ESRCH;
 537         if (r < 0)
 538                 return r;
 539
 540         for (;;) {
 541                 char c;
 542
 543                 if (sz >= ENVIRONMENT_BLOCK_MAX)
 544                         return -ENOBUFS;
 545
 546                 if (!GREEDY_REALLOC(outcome, allocated, sz + 5))
 547                         return -ENOMEM;
 548
 549                 r = safe_fgetc(f, &c);
 550                 if (r < 0)
 551                         return r;
 552                 if (r == 0)
 553                         break;
 554
 555                 if (c == '\0')
 556                         outcome[sz++] = '\n';
 557                 else
 558                         sz += cescape_char(c, outcome + sz);
 559         }
 560
 561         outcome[sz] = '\0';
 562         *env = TAKE_PTR(outcome);
 563
 564         return 0;
 565 }
 566
 567 int get_process_ppid(pid_t pid, pid_t *_ppid) {
 568         int r;
 569         _cleanup_free_ char *line = NULL;
 570         long unsigned ppid;
 571         const char *p;
 572
 573         assert(pid >= 0);
 574         assert(_ppid);
 575
 576         if (pid == 0 || pid == getpid_cached()) {
 577                 *_ppid = getppid();
 578                 return 0;
 579         }
 580
 581         p = procfs_file_alloca(pid, "stat");
 582         r = read_one_line_file(p, &line);
 583         if (r == -ENOENT)
 584                 return -ESRCH;
 585         if (r < 0)
 586                 return r;
 587
 588         /* Let's skip the pid and comm fields. The latter is enclosed
 589          * in () but does not escape any () in its value, so let's
 590          * skip over it manually */
 591
 592         p = strrchr(line, ')');
 593         if (!p)
 594                 return -EIO;
 595
 596         p++;
 597
 598         if (sscanf(p, " "
 599                    "%*c "  /* state */
 600                    "%lu ", /* ppid */
 601                    &ppid) != 1)
 602                 return -EIO;
 603
 604         if ((long unsigned) (pid_t) ppid != ppid)
 605                 return -ERANGE;
 606
 607         *_ppid = (pid_t) ppid;
 608
 609         return 0;
 610 }
 611
 612 int wait_for_terminate(pid_t pid, siginfo_t *status) {
 613         siginfo_t dummy;
 614
 615         assert(pid >= 1);
 616
 617         if (!status)
 618                 status = &dummy;
 619
 620         for (;;) {
 621                 zero(*status);
 622
 623                 if (waitid(P_PID, pid, status, WEXITED) < 0) {
 624
 625                         if (errno == EINTR)
 626                                 continue;
 627
 628                         return negative_errno();
 629                 }
 630
 631                 return 0;
 632         }
 633 }
 634
/*
 * Return values:
 * < 0 : wait_for_terminate() failed to get the state of the
 *       process, the process was terminated by a signal, or
 *       failed for an unknown reason.
 * >=0 : The process terminated normally, and its exit code is
 *       returned.
 *
 * That is, success is indicated by a return value of zero, and an
 * error is indicated by a non-zero value.
 *
 * If the process terminates abnormally this is logged at error
 * level if WAIT_LOG_ABNORMAL is set in flags, and at debug level
 * otherwise. Likewise, a non-zero exit status is logged at error
 * level if WAIT_LOG_NON_ZERO_EXIT_STATUS is set in flags, and at
 * debug level otherwise.
 */
 649 int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) {
 650         _cleanup_free_ char *buffer = NULL;
 651         siginfo_t status;
 652         int r, prio;
 653
 654         assert(pid > 1);
 655
 656         if (!name) {
 657                 r = get_process_comm(pid, &buffer);
 658                 if (r < 0)
 659                         log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pid);
 660                 else
 661                         name = buffer;
 662         }
 663
 664         prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
 665
 666         r = wait_for_terminate(pid, &status);
 667         if (r < 0)
 668                 return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name));
 669
 670         if (status.si_code == CLD_EXITED) {
 671                 if (status.si_status != EXIT_SUCCESS)
 672                         log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
 673                                  "%s failed with exit status %i.", strna(name), status.si_status);
 674                 else
 675                         log_debug("%s succeeded.", name);
 676
 677                 return status.si_status;
 678
 679         } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) {
 680
 681                 log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status));
 682                 return -EPROTO;
 683         }
 684
 685         log_full(prio, "%s failed due to unknown reason.", strna(name));
 686         return -EPROTO;
 687 }
 688
/*
 * Return values:
 *
 * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process
 *       was terminated by a signal, or failed for an unknown reason.
 *
 * >=0 : The process terminated normally with no failures.
 *
 * Success is indicated by a return value of zero, a timeout is indicated by -ETIMEDOUT, and all other child failure
 * states are indicated by a different negative error value.
 *
 * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off
 * to remain entirely race-free.
 */
 703 int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
 704         sigset_t mask;
 705         int r;
 706         usec_t until;
 707
 708         assert_se(sigemptyset(&mask) == 0);
 709         assert_se(sigaddset(&mask, SIGCHLD) == 0);
 710
 711         /* Drop into a sigtimewait-based timeout. Waiting for the
 712          * pid to exit. */
 713         until = now(CLOCK_MONOTONIC) + timeout;
 714         for (;;) {
 715                 usec_t n;
 716                 siginfo_t status = {};
 717                 struct timespec ts;
 718
 719                 n = now(CLOCK_MONOTONIC);
 720                 if (n >= until)
 721                         break;
 722
 723                 r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0;
 724                 /* Assuming we woke due to the child exiting. */
 725                 if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
 726                         if (status.si_pid == pid) {
 727                                 /* This is the correct child.*/
 728                                 if (status.si_code == CLD_EXITED)
 729                                         return (status.si_status == 0) ? 0 : -EPROTO;
 730                                 else
 731                                         return -EPROTO;
 732                         }
 733                 }
 734                 /* Not the child, check for errors and proceed appropriately */
 735                 if (r < 0) {
 736                         switch (r) {
 737                         case -EAGAIN:
 738                                 /* Timed out, child is likely hung. */
 739                                 return -ETIMEDOUT;
 740                         case -EINTR:
 741                                 /* Received a different signal and should retry */
 742                                 continue;
 743                         default:
 744                                 /* Return any unexpected errors */
 745                                 return r;
 746                         }
 747                 }
 748         }
 749
 750         return -EPROTO;
 751 }
 752
 753 void sigkill_wait(pid_t pid) {
 754         assert(pid > 1);
 755
 756         if (kill(pid, SIGKILL) >= 0)
 757                 (void) wait_for_terminate(pid, NULL);
 758 }
 759
 760 void sigkill_waitp(pid_t *pid) {
 761         PROTECT_ERRNO;
 762
 763         if (!pid)
 764                 return;
 765         if (*pid <= 1)
 766                 return;
 767
 768         sigkill_wait(*pid);
 769 }
 770
 771 void sigterm_wait(pid_t pid) {
 772         assert(pid > 1);
 773
 774         if (kill_and_sigcont(pid, SIGTERM) >= 0)
 775                 (void) wait_for_terminate(pid, NULL);
 776 }
 777
 778 int kill_and_sigcont(pid_t pid, int sig) {
 779         int r;
 780
 781         r = kill(pid, sig) < 0 ? -errno : 0;
 782
 783         /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
 784          * affected by a process being suspended anyway. */
 785         if (r >= 0 && !IN_SET(sig, SIGCONT, SIGKILL))
 786                 (void) kill(pid, SIGCONT);
 787
 788         return r;
 789 }
 790
 791 int getenv_for_pid(pid_t pid, const char *field, char **ret) {
 792         _cleanup_fclose_ FILE *f = NULL;
 793         char *value = NULL;
 794         const char *path;
 795         size_t l, sum = 0;
 796         int r;
 797
 798         assert(pid >= 0);
 799         assert(field);
 800         assert(ret);
 801
 802         if (pid == 0 || pid == getpid_cached()) {
 803                 const char *e;
 804
 805                 e = getenv(field);
 806                 if (!e) {
 807                         *ret = NULL;
 808                         return 0;
 809                 }
 810
 811                 value = strdup(e);
 812                 if (!value)
 813                         return -ENOMEM;
 814
 815                 *ret = value;
 816                 return 1;
 817         }
 818
 819         if (!pid_is_valid(pid))
 820                 return -EINVAL;
 821
 822         path = procfs_file_alloca(pid, "environ");
 823
 824         r = fopen_unlocked(path, "re", &f);
 825         if (r == -ENOENT)
 826                 return -ESRCH;
 827         if (r < 0)
 828                 return r;
 829
 830         l = strlen(field);
 831         for (;;) {
 832                 _cleanup_free_ char *line = NULL;
 833
 834                 if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */
 835                         return -ENOBUFS;
 836
 837                 r = read_nul_string(f, LONG_LINE_MAX, &line);
 838                 if (r < 0)
 839                         return r;
 840                 if (r == 0)  /* EOF */
 841                         break;
 842
 843                 sum += r;
 844
 845                 if (strneq(line, field, l) && line[l] == '=') {
 846                         value = strdup(line + l + 1);
 847                         if (!value)
 848                                 return -ENOMEM;
 849
 850                         *ret = value;
 851                         return 1;
 852                 }
 853         }
 854
 855         *ret = NULL;
 856         return 0;
 857 }
 858
 859 int pid_is_my_child(pid_t pid) {
 860         pid_t ppid;
 861         int r;
 862
 863         if (pid <= 1)
 864                 return false;
 865
 866         r = get_process_ppid(pid, &ppid);
 867         if (r < 0)
 868                 return r;
 869
 870         return ppid == getpid_cached();
 871 }
 872
 873 bool pid_is_unwaited(pid_t pid) {
 874         /* Checks whether a PID is still valid at all, including a zombie */
 875
 876         if (pid < 0)
 877                 return false;
 878
 879         if (pid <= 1) /* If we or PID 1 would be dead and have been waited for, this code would not be running */
 880                 return true;
 881
 882         if (pid == getpid_cached())
 883                 return true;
 884
 885         if (kill(pid, 0) >= 0)
 886                 return true;
 887
 888         return errno != ESRCH;
 889 }
 890
 891 bool pid_is_alive(pid_t pid) {
 892         int r;
 893
 894         /* Checks whether a PID is still valid and not a zombie */
 895
 896         if (pid < 0)
 897                 return false;
 898
 899         if (pid <= 1) /* If we or PID 1 would be a zombie, this code would not be running */
 900                 return true;
 901
 902         if (pid == getpid_cached())
 903                 return true;
 904
 905         r = get_process_state(pid);
 906         if (IN_SET(r, -ESRCH, 'Z'))
 907                 return false;
 908
 909         return true;
 910 }
 911
 912 int pid_from_same_root_fs(pid_t pid) {
 913         const char *root;
 914
 915         if (pid < 0)
 916                 return false;
 917
 918         if (pid == 0 || pid == getpid_cached())
 919                 return true;
 920
 921         root = procfs_file_alloca(pid, "root");
 922
 923         return files_same(root, "/proc/1/root", 0);
 924 }
 925
 926 bool is_main_thread(void) {
 927         static thread_local int cached = 0;
 928
 929         if (_unlikely_(cached == 0))
 930                 cached = getpid_cached() == gettid() ? 1 : -1;
 931
 932         return cached > 0;
 933 }
 934
 935 _noreturn_ void freeze(void) {
 936
 937         log_close();
 938
 939         /* Make sure nobody waits for us on a socket anymore */
 940         (void) close_all_fds(NULL, 0);
 941
 942         sync();
 943
 944         /* Let's not freeze right away, but keep reaping zombies. */
 945         for (;;) {
 946                 int r;
 947                 siginfo_t si = {};
 948
 949                 r = waitid(P_ALL, 0, &si, WEXITED);
 950                 if (r < 0 && errno != EINTR)
 951                         break;
 952         }
 953
 954         /* waitid() failed with an unexpected error, things are really borked. Freeze now! */
 955         for (;;)
 956                 pause();
 957 }
 958
 959 bool oom_score_adjust_is_valid(int oa) {
 960         return oa >= OOM_SCORE_ADJ_MIN && oa <= OOM_SCORE_ADJ_MAX;
 961 }
 962
 963 unsigned long personality_from_string(const char *p) {
 964         int architecture;
 965
 966         if (!p)
 967                 return PERSONALITY_INVALID;
 968
 969         /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
 970          * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
 971          * the same register size. */
 972
 973         architecture = architecture_from_string(p);
 974         if (architecture < 0)
 975                 return PERSONALITY_INVALID;
 976
 977         if (architecture == native_architecture())
 978                 return PER_LINUX;
 979 #ifdef SECONDARY_ARCHITECTURE
 980         if (architecture == SECONDARY_ARCHITECTURE)
 981                 return PER_LINUX32;
 982 #endif
 983
 984         return PERSONALITY_INVALID;
 985 }
 986
 987 const char* personality_to_string(unsigned long p) {
 988         int architecture = _ARCHITECTURE_INVALID;
 989
 990         if (p == PER_LINUX)
 991                 architecture = native_architecture();
 992 #ifdef SECONDARY_ARCHITECTURE
 993         else if (p == PER_LINUX32)
 994                 architecture = SECONDARY_ARCHITECTURE;
 995 #endif
 996
 997         if (architecture < 0)
 998                 return NULL;
 999
1000         return architecture_to_string(architecture);
1001 }
1002
/* Wrapper around personality() that papers over glibc's inconsistent error reporting: in some cases
 * failure is signalled via errno, in others via a negative return value carrying an errno-like code.
 * We prefer errno if it was set, and fall back to the return value otherwise; on failure both errno and
 * our (negative) return value end up describing the same error.
 *
 * See https://github.com/systemd/systemd/issues/6737 */
int safe_personality(unsigned long p) {
        int persona;

        errno = 0;
        persona = personality(p);

        if (persona >= 0)
                return persona;

        if (errno == 0) {
                /* Only the return value carries the error: mirror it into errno, too */
                errno = -persona;
                return persona;
        }

        return -errno;
}
1024
1025 int opinionated_personality(unsigned long *ret) {
1026         int current;
1027
1028         /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1029          * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1030          * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1031
1032         current = safe_personality(PERSONALITY_INVALID);
1033         if (current < 0)
1034                 return current;
1035
1036         if (((unsigned long) current & 0xffff) == PER_LINUX32)
1037                 *ret = PER_LINUX32;
1038         else
1039                 *ret = PER_LINUX;
1040
1041         return 0;
1042 }
1043
/* If built with valgrind support, and running as PID 1 under valgrind: clones off a helper process that
 * exits immediately, and waits for it. Without valgrind support this is a no-op. */
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        pid_t helper;

        if (getpid_cached() != 1 || !RUNNING_ON_VALGRIND)
                return;

        helper = raw_clone(SIGCHLD);
        if (helper == 0)
                /* In the helper: terminate right away, via the regular exit() path */
                exit(EXIT_SUCCESS);

        if (helper < 0) {
                log_emergency_errno(errno, "Failed to fork off valgrind helper: %m");
                return;
        }

        log_info("Spawned valgrind helper as PID "PID_FMT".", helper);
        (void) wait_for_terminate(helper, NULL);
#endif
}
1060
/* Three-way comparison of two PIDs, with a signature suitable for use with qsort(). */
int pid_compare_func(const pid_t *a, const pid_t *b) {
        pid_t lhs = *a, rhs = *b;

        return CMP(lhs, rhs);
}
1065
/* Parses an I/O priority value from the string 's'.
 *
 * On success stores the value in *ret and returns 0. Returns the error of safe_atoi() if the string
 * cannot be parsed as integer, and -EINVAL if the integer is not accepted by
 * ioprio_priority_is_valid(). *ret is only written on success. */
int ioprio_parse_priority(const char *s, int *ret) {
        int parsed, r;

        assert(s);
        assert(ret);

        r = safe_atoi(s, &parsed);
        if (r < 0)
                return r;

        if (ioprio_priority_is_valid(parsed)) {
                *ret = parsed;
                return 0;
        }

        return -EINVAL;
}
1082
/* The cached PID, possible values:
 *
 *     == UNSET [0]  → cache not initialized yet
 *     == BUSY [-1]  → some thread is initializing it at the moment
 *     any other     → the cached PID
 *
 * The variable is read and filled in by getpid_cached(), and cleared again by reset_cached_pid().
 */

#define CACHED_PID_UNSET ((pid_t) 0)
#define CACHED_PID_BUSY ((pid_t) -1)

/* Starts out as UNSET, so that the first getpid_cached() call queries the real PID */
static pid_t cached_pid = CACHED_PID_UNSET;

/* Drops the cached PID, so that the next getpid_cached() call determines it anew. */
void reset_cached_pid(void) {
        /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
        cached_pid = CACHED_PID_UNSET;
}
1099
1100 /* We use glibc __register_atfork() + __dso_handle directly here, as they are not included in the glibc
1101  * headers. __register_atfork() is mostly equivalent to pthread_atfork(), but doesn't require us to link against
1102  * libpthread, as it is part of glibc anyway. */
1103 extern int __register_atfork(void (*prepare) (void), void (*parent) (void), void (*child) (void), void *dso_handle);
1104 extern void* __dso_handle _weak_;
1105
1106 pid_t getpid_cached(void) {
1107         static bool installed = false;
1108         pid_t current_value;
1109
1110         /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1111          * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1112          * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1113          * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1114          *
1115          * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1116          * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1117          */
1118
1119         current_value = __sync_val_compare_and_swap(&cached_pid, CACHED_PID_UNSET, CACHED_PID_BUSY);
1120
1121         switch (current_value) {
1122
1123         case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
1124                 pid_t new_pid;
1125
1126                 new_pid = raw_getpid();
1127
1128                 if (!installed) {
1129                         /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1130                          * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1131                          * we'll check for errors only in the most generic fashion possible. */
1132
1133                         if (__register_atfork(NULL, NULL, reset_cached_pid, __dso_handle) != 0) {
1134                                 /* OOM? Let's try again later */
1135                                 cached_pid = CACHED_PID_UNSET;
1136                                 return new_pid;
1137                         }
1138
1139                         installed = true;
1140                 }
1141
1142                 cached_pid = new_pid;
1143                 return new_pid;
1144         }
1145
1146         case CACHED_PID_BUSY: /* Somebody else is currently initializing */
1147                 return raw_getpid();
1148
1149         default: /* Properly initialized */
1150                 return current_value;
1151         }
1152 }
1153
/* Returns 0 if the effective UID is 0. Otherwise logs an error and returns what log_error_errno()
 * yields for the synthetic EPERM. */
int must_be_root(void) {

        if (geteuid() != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root.");

        return 0;
}
1161
1162 int safe_fork_full(
1163                 const char *name,
1164                 const int except_fds[],
1165                 size_t n_except_fds,
1166                 ForkFlags flags,
1167                 pid_t *ret_pid) {
1168
1169         pid_t original_pid, pid;
1170         sigset_t saved_ss, ss;
1171         bool block_signals = false;
1172         int prio, r;
1173
1174         /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always
1175          * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */
1176
1177         prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;
1178
1179         original_pid = getpid_cached();
1180
1181         if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG)) {
1182                 /* We temporarily block all signals, so that the new child has them blocked initially. This way, we can
1183                  * be sure that SIGTERMs are not lost we might send to the child. */
1184
1185                 assert_se(sigfillset(&ss) >= 0);
1186                 block_signals = true;
1187
1188         } else if (flags & FORK_WAIT) {
1189                 /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1190
1191                 assert_se(sigemptyset(&ss) >= 0);
1192                 assert_se(sigaddset(&ss, SIGCHLD) >= 0);
1193                 block_signals = true;
1194         }
1195
1196         if (block_signals)
1197                 if (sigprocmask(SIG_SETMASK, &ss, &saved_ss) < 0)
1198                         return log_full_errno(prio, errno, "Failed to set signal mask: %m");
1199
1200         if (flags & FORK_NEW_MOUNTNS)
1201                 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
1202         else
1203                 pid = fork();
1204         if (pid < 0) {
1205                 r = -errno;
1206
1207                 if (block_signals) /* undo what we did above */
1208                         (void) sigprocmask(SIG_SETMASK, &saved_ss, NULL);
1209
1210                 return log_full_errno(prio, r, "Failed to fork: %m");
1211         }
1212         if (pid > 0) {
1213                 /* We are in the parent process */
1214
1215                 log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
1216
1217                 if (flags & FORK_WAIT) {
1218                         r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? WAIT_LOG : 0));
1219                         if (r < 0)
1220                                 return r;
1221                         if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
1222                                 return -EPROTO;
1223                 }
1224
1225                 if (block_signals) /* undo what we did above */
1226                         (void) sigprocmask(SIG_SETMASK, &saved_ss, NULL);
1227
1228                 if (ret_pid)
1229                         *ret_pid = pid;
1230
1231                 return 1;
1232         }
1233
1234         /* We are in the child process */
1235
1236         if (flags & FORK_REOPEN_LOG) {
1237                 /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1238                 log_close();
1239                 log_set_open_when_needed(true);
1240         }
1241
1242         if (name) {
1243                 r = rename_process(name);
1244                 if (r < 0)
1245                         log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
1246                                        r, "Failed to rename process, ignoring: %m");
1247         }
1248
1249         if (flags & FORK_DEATHSIG)
1250                 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) {
1251                         log_full_errno(prio, errno, "Failed to set death signal: %m");
1252                         _exit(EXIT_FAILURE);
1253                 }
1254
1255         if (flags & FORK_RESET_SIGNALS) {
1256                 r = reset_all_signal_handlers();
1257                 if (r < 0) {
1258                         log_full_errno(prio, r, "Failed to reset signal handlers: %m");
1259                         _exit(EXIT_FAILURE);
1260                 }
1261
1262                 /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1263                 r = reset_signal_mask();
1264                 if (r < 0) {
1265                         log_full_errno(prio, r, "Failed to reset signal mask: %m");
1266                         _exit(EXIT_FAILURE);
1267                 }
1268         } else if (block_signals) { /* undo what we did above */
1269                 if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
1270                         log_full_errno(prio, errno, "Failed to restore signal mask: %m");
1271                         _exit(EXIT_FAILURE);
1272                 }
1273         }
1274
1275         if (flags & FORK_DEATHSIG) {
1276                 pid_t ppid;
1277                 /* Let's see if the parent PID is still the one we started from? If not, then the parent
1278                  * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1279
1280                 ppid = getppid();
1281                 if (ppid == 0)
1282                         /* Parent is in a differn't PID namespace. */;
1283                 else if (ppid != original_pid) {
1284                         log_debug("Parent died early, raising SIGTERM.");
1285                         (void) raise(SIGTERM);
1286                         _exit(EXIT_FAILURE);
1287                 }
1288         }
1289
1290         if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
1291
1292                 /* Optionally, make sure we never propagate mounts to the host. */
1293
1294                 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1295                         log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
1296                         _exit(EXIT_FAILURE);
1297                 }
1298         }
1299
1300         if (flags & FORK_CLOSE_ALL_FDS) {
1301                 /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1302                 log_close();
1303
1304                 r = close_all_fds(except_fds, n_except_fds);
1305                 if (r < 0) {
1306                         log_full_errno(prio, r, "Failed to close all file descriptors: %m");
1307                         _exit(EXIT_FAILURE);
1308                 }
1309         }
1310
1311         /* When we were asked to reopen the logs, do so again now */
1312         if (flags & FORK_REOPEN_LOG) {
1313                 log_open();
1314                 log_set_open_when_needed(false);
1315         }
1316
1317         if (flags & FORK_NULL_STDIO) {
1318                 r = make_null_stdio();
1319                 if (r < 0) {
1320                         log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
1321                         _exit(EXIT_FAILURE);
1322                 }
1323         }
1324
1325         if (flags & FORK_RLIMIT_NOFILE_SAFE) {
1326                 r = rlimit_nofile_safe();
1327                 if (r < 0) {
1328                         log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
1329                         _exit(EXIT_FAILURE);
1330                 }
1331         }
1332
1333         if (ret_pid)
1334                 *ret_pid = getpid_cached();
1335
1336         return 0;
1337 }
1338
1339 int namespace_fork(
1340                 const char *outer_name,
1341                 const char *inner_name,
1342                 const int except_fds[],
1343                 size_t n_except_fds,
1344                 ForkFlags flags,
1345                 int pidns_fd,
1346                 int mntns_fd,
1347                 int netns_fd,
1348                 int userns_fd,
1349                 int root_fd,
1350                 pid_t *ret_pid) {
1351
1352         int r;
1353
1354         /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1355          * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1356          * /proc/self/fd works correctly. */
1357
1358         r = safe_fork_full(outer_name, except_fds, n_except_fds, (flags|FORK_DEATHSIG) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
1359         if (r < 0)
1360                 return r;
1361         if (r == 0) {
1362                 pid_t pid;
1363
1364                 /* Child */
1365
1366                 r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd);
1367                 if (r < 0) {
1368                         log_full_errno(FLAGS_SET(flags, FORK_LOG) ? LOG_ERR : LOG_DEBUG, r, "Failed to join namespace: %m");
1369                         _exit(EXIT_FAILURE);
1370                 }
1371
1372                 /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1373                 r = safe_fork_full(inner_name, except_fds, n_except_fds, flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NULL_STDIO), &pid);
1374                 if (r < 0)
1375                         _exit(EXIT_FAILURE);
1376                 if (r == 0) {
1377                         /* Child */
1378                         if (ret_pid)
1379                                 *ret_pid = pid;
1380                         return 0;
1381                 }
1382
1383                 r = wait_for_terminate_and_check(inner_name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1384                 if (r < 0)
1385                         _exit(EXIT_FAILURE);
1386
1387                 _exit(r);
1388         }
1389
1390         return 1;
1391 }
1392
1393 int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
1394         bool stdout_is_tty, stderr_is_tty;
1395         size_t n, i;
1396         va_list ap;
1397         char **l;
1398         int r;
1399
1400         assert(path);
1401
1402         /* Spawns a temporary TTY agent, making sure it goes away when we go away */
1403
1404         r = safe_fork_full(name, except, n_except, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_CLOSE_ALL_FDS, ret_pid);
1405         if (r < 0)
1406                 return r;
1407         if (r > 0)
1408                 return 0;
1409
1410         /* In the child: */
1411
1412         stdout_is_tty = isatty(STDOUT_FILENO);
1413         stderr_is_tty = isatty(STDERR_FILENO);
1414
1415         if (!stdout_is_tty || !stderr_is_tty) {
1416                 int fd;
1417
1418                 /* Detach from stdout/stderr. and reopen
1419                  * /dev/tty for them. This is important to
1420                  * ensure that when systemctl is started via
1421                  * popen() or a similar call that expects to
1422                  * read EOF we actually do generate EOF and
1423                  * not delay this indefinitely by because we
1424                  * keep an unused copy of stdin around. */
1425                 fd = open("/dev/tty", O_WRONLY);
1426                 if (fd < 0) {
1427                         log_error_errno(errno, "Failed to open /dev/tty: %m");
1428                         _exit(EXIT_FAILURE);
1429                 }
1430
1431                 if (!stdout_is_tty && dup2(fd, STDOUT_FILENO) < 0) {
1432                         log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
1433                         _exit(EXIT_FAILURE);
1434                 }
1435
1436                 if (!stderr_is_tty && dup2(fd, STDERR_FILENO) < 0) {
1437                         log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
1438                         _exit(EXIT_FAILURE);
1439                 }
1440
1441                 safe_close_above_stdio(fd);
1442         }
1443
1444         (void) rlimit_nofile_safe();
1445
1446         /* Count arguments */
1447         va_start(ap, path);
1448         for (n = 0; va_arg(ap, char*); n++)
1449                 ;
1450         va_end(ap);
1451
1452         /* Allocate strv */
1453         l = newa(char*, n + 1);
1454
1455         /* Fill in arguments */
1456         va_start(ap, path);
1457         for (i = 0; i <= n; i++)
1458                 l[i] = va_arg(ap, char*);
1459         va_end(ap);
1460
1461         execv(path, l);
1462         _exit(EXIT_FAILURE);
1463 }
1464
1465 int set_oom_score_adjust(int value) {
1466         char t[DECIMAL_STR_MAX(int)];
1467
1468         sprintf(t, "%i", value);
1469
1470         return write_string_file("/proc/self/oom_score_adj", t,
1471                                  WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
1472 }
1473
/* Returns the number of CPUs set in the affinity mask of the calling thread, or a negative error.
 *
 * The CPU set is allocated dynamically, starting with room for 16 CPUs; whenever sched_getaffinity()
 * rejects the size with EINVAL the room is doubled and the call retried. Returns -ENOMEM if allocation
 * fails or the size cannot be doubled any further, and -EINVAL if the mask turns out to be empty. */
int cpus_in_affinity_mask(void) {
        size_t ncpus = 16;

        for (;;) {
                cpu_set_t *set;
                size_t sz;
                int r;

                set = CPU_ALLOC(ncpus);
                if (!set)
                        return -ENOMEM;

                sz = CPU_ALLOC_SIZE(ncpus);

                if (sched_getaffinity(0, sz, set) < 0) {
                        /* Save errno before freeing, as the latter might clobber it */
                        r = -errno;
                        CPU_FREE(set);

                        if (r != -EINVAL)
                                return r;
                        if (ncpus > SIZE_MAX/2)
                                return -ENOMEM;

                        ncpus *= 2;
                        continue;
                }

                r = CPU_COUNT_S(sz, set);
                CPU_FREE(set);

                return r > 0 ? r : -EINVAL;
        }
}
1507
/* Textual names of the I/O priority classes, indexed by the respective IOPRIO_CLASS_* constant. */
static const char *const ioprio_class_table[] = {
        [IOPRIO_CLASS_NONE] = "none",
        [IOPRIO_CLASS_RT] = "realtime",
        [IOPRIO_CLASS_BE] = "best-effort",
        [IOPRIO_CLASS_IDLE] = "idle"
};

/* Instantiates the lookup helpers for the table above. NOTE(review): the macro is defined outside of this
 * file (presumably in string-table.h); the "fallback" variant presumably also copes with values that
 * have no table entry, with IOPRIO_N_CLASSES as upper bound — confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(ioprio_class, int, IOPRIO_N_CLASSES);
1516
/* Textual names of the CLD_* codes, indexed by the respective constant. */
static const char *const sigchld_code_table[] = {
        [CLD_EXITED] = "exited",
        [CLD_KILLED] = "killed",
        [CLD_DUMPED] = "dumped",
        [CLD_TRAPPED] = "trapped",
        [CLD_STOPPED] = "stopped",
        [CLD_CONTINUED] = "continued",
};

/* Instantiates the lookup helpers for the table above. NOTE(review): the macro is defined outside of this
 * file (presumably in string-table.h) — confirm which functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
1527
/* Textual names of the CPU scheduling policies, indexed by the respective SCHED_* constant. */
static const char* const sched_policy_table[] = {
        [SCHED_OTHER] = "other",
        [SCHED_BATCH] = "batch",
        [SCHED_IDLE] = "idle",
        [SCHED_FIFO] = "fifo",
        [SCHED_RR] = "rr"
};

/* Instantiates the lookup helpers for the table above. NOTE(review): the macro is defined outside of this
 * file (presumably in string-table.h); the "fallback" variant presumably also copes with values that
 * have no table entry, with INT_MAX as upper bound — confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);