src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_SECCOMP
  29 #include <seccomp.h>
  30 #endif
  31
  32 #if HAVE_APPARMOR
  33 #include <sys/apparmor.h>
  34 #endif
  35
  36 #include "sd-messages.h"
  37
  38 #include "acl-util.h"
  39 #include "af-list.h"
  40 #include "alloc-util.h"
  41 #if HAVE_APPARMOR
  42 #include "apparmor-util.h"
  43 #endif
  44 #include "argv-util.h"
  45 #include "async.h"
  46 #include "barrier.h"
  47 #include "bpf-lsm.h"
  48 #include "btrfs-util.h"
  49 #include "cap-list.h"
  50 #include "capability-util.h"
  51 #include "chattr-util.h"
  52 #include "cgroup-setup.h"
  53 #include "chase.h"
  54 #include "chown-recursive.h"
  55 #include "constants.h"
  56 #include "cpu-set-util.h"
  57 #include "creds-util.h"
  58 #include "data-fd-util.h"
  59 #include "env-file.h"
  60 #include "env-util.h"
  61 #include "errno-list.h"
  62 #include "escape.h"
  63 #include "execute.h"
  64 #include "exit-status.h"
  65 #include "fd-util.h"
  66 #include "fileio.h"
  67 #include "format-util.h"
  68 #include "glob-util.h"
  69 #include "hexdecoct.h"
  70 #include "io-util.h"
  71 #include "ioprio-util.h"
  72 #include "label-util.h"
  73 #include "lock-util.h"
  74 #include "log.h"
  75 #include "macro.h"
  76 #include "manager.h"
  77 #include "manager-dump.h"
  78 #include "memory-util.h"
  79 #include "missing_fs.h"
  80 #include "missing_ioprio.h"
  81 #include "missing_prctl.h"
  82 #include "mkdir-label.h"
  83 #include "mount-util.h"
  84 #include "mountpoint-util.h"
  85 #include "namespace.h"
  86 #include "parse-util.h"
  87 #include "path-util.h"
  88 #include "proc-cmdline.h"
  89 #include "process-util.h"
  90 #include "psi-util.h"
  91 #include "random-util.h"
  92 #include "recurse-dir.h"
  93 #include "rlimit-util.h"
  94 #include "rm-rf.h"
  95 #if HAVE_SECCOMP
  96 #include "seccomp-util.h"
  97 #endif
  98 #include "securebits-util.h"
  99 #include "selinux-util.h"
 100 #include "signal-util.h"
 101 #include "smack-util.h"
 102 #include "socket-util.h"
 103 #include "sort-util.h"
 104 #include "special.h"
 105 #include "stat-util.h"
 106 #include "string-table.h"
 107 #include "string-util.h"
 108 #include "strv.h"
 109 #include "syslog-util.h"
 110 #include "terminal-util.h"
 111 #include "tmpfile-util.h"
 112 #include "umask-util.h"
 113 #include "unit-serialize.h"
 114 #include "user-util.h"
 115 #include "utmp-wtmp.h"
 116
 117 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 118 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 119
 120 #define SNDBUF_SIZE (8*1024*1024)
 121
 122 static int shift_fds(int fds[], size_t n_fds) {
 123         if (n_fds <= 0)
 124                 return 0;
 125
 126         /* Modifies the fds array! (sorts it) */
 127
 128         assert(fds);
 129
 130         for (int start = 0;;) {
 131                 int restart_from = -1;
 132
 133                 for (int i = start; i < (int) n_fds; i++) {
 134                         int nfd;
 135
 136                         /* Already at right index? */
 137                         if (fds[i] == i+3)
 138                                 continue;
 139
 140                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 141                         if (nfd < 0)
 142                                 return -errno;
 143
 144                         safe_close(fds[i]);
 145                         fds[i] = nfd;
 146
 147                         /* Hmm, the fd we wanted isn't free? Then
 148                          * let's remember that and try again from here */
 149                         if (nfd != i+3 && restart_from < 0)
 150                                 restart_from = i;
 151                 }
 152
 153                 if (restart_from < 0)
 154                         break;
 155
 156                 start = restart_from;
 157         }
 158
 159         return 0;
 160 }
 161
 162 static int flags_fds(
 163                 const int fds[],
 164                 size_t n_socket_fds,
 165                 size_t n_fds,
 166                 bool nonblock) {
 167
 168         int r;
 169
 170         if (n_fds <= 0)
 171                 return 0;
 172
 173         assert(fds);
 174
 175         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 176          * O_NONBLOCK only applies to socket activation though. */
 177
 178         for (size_t i = 0; i < n_fds; i++) {
 179
 180                 if (i < n_socket_fds) {
 181                         r = fd_nonblock(fds[i], nonblock);
 182                         if (r < 0)
 183                                 return r;
 184                 }
 185
 186                 /* We unconditionally drop FD_CLOEXEC from the fds,
 187                  * since after all we want to pass these fds to our
 188                  * children */
 189
 190                 r = fd_cloexec(fds[i], false);
 191                 if (r < 0)
 192                         return r;
 193         }
 194
 195         return 0;
 196 }
 197
 198 static const char *exec_context_tty_path(const ExecContext *context) {
 199         assert(context);
 200
 201         if (context->stdio_as_fds)
 202                 return NULL;
 203
 204         if (context->tty_path)
 205                 return context->tty_path;
 206
 207         return "/dev/console";
 208 }
 209
 210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 211         _cleanup_free_ char *rowskey = NULL, *rowsvalue = NULL, *colskey = NULL, *colsvalue = NULL;
 212         unsigned rows, cols;
 213         const char *tty;
 214         int r;
 215
 216         assert(context);
 217         assert(ret_rows);
 218         assert(ret_cols);
 219
 220         rows = context->tty_rows;
 221         cols = context->tty_cols;
 222
 223         tty = exec_context_tty_path(context);
 224         if (!tty || (rows != UINT_MAX && cols != UINT_MAX)) {
 225                 *ret_rows = rows;
 226                 *ret_cols = cols;
 227                 return 0;
 228         }
 229
 230         tty = skip_dev_prefix(tty);
 231         if (!in_charset(tty, ALPHANUMERICAL)) {
 232                 log_debug("%s contains non-alphanumeric characters, ignoring", tty);
 233                 *ret_rows = rows;
 234                 *ret_cols = cols;
 235                 return 0;
 236         }
 237
 238         rowskey = strjoin("systemd.tty.rows.", tty);
 239         if (!rowskey)
 240                 return -ENOMEM;
 241
 242         colskey = strjoin("systemd.tty.columns.", tty);
 243         if (!colskey)
 244                 return -ENOMEM;
 245
 246         r = proc_cmdline_get_key_many(/* flags = */ 0,
 247                                       rowskey, &rowsvalue,
 248                                       colskey, &colsvalue);
 249         if (r < 0)
 250                 log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty);
 251
 252         if (rows == UINT_MAX && rowsvalue) {
 253                 r = safe_atou(rowsvalue, &rows);
 254                 if (r < 0)
 255                         log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", rowskey, rowsvalue);
 256         }
 257
 258         if (cols == UINT_MAX && colsvalue) {
 259                 r = safe_atou(colsvalue, &cols);
 260                 if (r < 0)
 261                         log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", colskey, colsvalue);
 262         }
 263
 264         *ret_rows = rows;
 265         *ret_cols = cols;
 266
 267         return 0;
 268 }
 269
 270 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 271         const char *path;
 272
 273         assert(context);
 274
 275         path = exec_context_tty_path(context);
 276
 277         if (context->tty_vhangup) {
 278                 if (p && p->stdin_fd >= 0)
 279                         (void) terminal_vhangup_fd(p->stdin_fd);
 280                 else if (path)
 281                         (void) terminal_vhangup(path);
 282         }
 283
 284         if (context->tty_reset) {
 285                 if (p && p->stdin_fd >= 0)
 286                         (void) reset_terminal_fd(p->stdin_fd, true);
 287                 else if (path)
 288                         (void) reset_terminal(path);
 289         }
 290
 291         if (p && p->stdin_fd >= 0) {
 292                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 293
 294                 (void) exec_context_tty_size(context, &rows, &cols);
 295                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 296         }
 297
 298         if (context->tty_vt_disallocate && path)
 299                 (void) vt_disallocate(path);
 300 }
 301
 302 static bool is_terminal_input(ExecInput i) {
 303         return IN_SET(i,
 304                       EXEC_INPUT_TTY,
 305                       EXEC_INPUT_TTY_FORCE,
 306                       EXEC_INPUT_TTY_FAIL);
 307 }
 308
 309 static bool is_terminal_output(ExecOutput o) {
 310         return IN_SET(o,
 311                       EXEC_OUTPUT_TTY,
 312                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 313                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 314 }
 315
 316 static bool is_kmsg_output(ExecOutput o) {
 317         return IN_SET(o,
 318                       EXEC_OUTPUT_KMSG,
 319                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 320 }
 321
 322 static bool exec_context_needs_term(const ExecContext *c) {
 323         assert(c);
 324
 325         /* Return true if the execution context suggests we should set $TERM to something useful. */
 326
 327         if (is_terminal_input(c->std_input))
 328                 return true;
 329
 330         if (is_terminal_output(c->std_output))
 331                 return true;
 332
 333         if (is_terminal_output(c->std_error))
 334                 return true;
 335
 336         return !!c->tty_path;
 337 }
 338
 339 static int open_null_as(int flags, int nfd) {
 340         int fd;
 341
 342         assert(nfd >= 0);
 343
 344         fd = open("/dev/null", flags|O_NOCTTY);
 345         if (fd < 0)
 346                 return -errno;
 347
 348         return move_fd(fd, nfd, false);
 349 }
 350
 351 static int connect_journal_socket(
 352                 int fd,
 353                 const char *log_namespace,
 354                 uid_t uid,
 355                 gid_t gid) {
 356
 357         uid_t olduid = UID_INVALID;
 358         gid_t oldgid = GID_INVALID;
 359         const char *j;
 360         int r;
 361
 362         j = log_namespace ?
 363                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 364                 "/run/systemd/journal/stdout";
 365
 366         if (gid_is_valid(gid)) {
 367                 oldgid = getgid();
 368
 369                 if (setegid(gid) < 0)
 370                         return -errno;
 371         }
 372
 373         if (uid_is_valid(uid)) {
 374                 olduid = getuid();
 375
 376                 if (seteuid(uid) < 0) {
 377                         r = -errno;
 378                         goto restore_gid;
 379                 }
 380         }
 381
 382         r = connect_unix_path(fd, AT_FDCWD, j);
 383
 384         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 385            an LSM interferes. */
 386
 387         if (uid_is_valid(uid))
 388                 (void) seteuid(olduid);
 389
 390  restore_gid:
 391         if (gid_is_valid(gid))
 392                 (void) setegid(oldgid);
 393
 394         return r;
 395 }
 396
 397 static int connect_logger_as(
 398                 const Unit *unit,
 399                 const ExecContext *context,
 400                 const ExecParameters *params,
 401                 ExecOutput output,
 402                 const char *ident,
 403                 int nfd,
 404                 uid_t uid,
 405                 gid_t gid) {
 406
 407         _cleanup_close_ int fd = -EBADF;
 408         int r;
 409
 410         assert(context);
 411         assert(params);
 412         assert(output < _EXEC_OUTPUT_MAX);
 413         assert(ident);
 414         assert(nfd >= 0);
 415
 416         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 417         if (fd < 0)
 418                 return -errno;
 419
 420         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 421         if (r < 0)
 422                 return r;
 423
 424         if (shutdown(fd, SHUT_RD) < 0)
 425                 return -errno;
 426
 427         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 428
 429         if (dprintf(fd,
 430                 "%s\n"
 431                 "%s\n"
 432                 "%i\n"
 433                 "%i\n"
 434                 "%i\n"
 435                 "%i\n"
 436                 "%i\n",
 437                 context->syslog_identifier ?: ident,
 438                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 439                 context->syslog_priority,
 440                 !!context->syslog_level_prefix,
 441                 false,
 442                 is_kmsg_output(output),
 443                 is_terminal_output(output)) < 0)
 444                 return -errno;
 445
 446         return move_fd(TAKE_FD(fd), nfd, false);
 447 }
 448
 449 static int open_terminal_as(const char *path, int flags, int nfd) {
 450         int fd;
 451
 452         assert(path);
 453         assert(nfd >= 0);
 454
 455         fd = open_terminal(path, flags | O_NOCTTY);
 456         if (fd < 0)
 457                 return fd;
 458
 459         return move_fd(fd, nfd, false);
 460 }
 461
 462 static int acquire_path(const char *path, int flags, mode_t mode) {
 463         _cleanup_close_ int fd = -EBADF;
 464         int r;
 465
 466         assert(path);
 467
 468         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 469                 flags |= O_CREAT;
 470
 471         fd = open(path, flags|O_NOCTTY, mode);
 472         if (fd >= 0)
 473                 return TAKE_FD(fd);
 474
 475         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 476                 return -errno;
 477
 478         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 479
 480         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 481         if (fd < 0)
 482                 return -errno;
 483
 484         r = connect_unix_path(fd, AT_FDCWD, path);
 485         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 486                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 487                  * wasn't an AF_UNIX socket after all */
 488                 return -ENXIO;
 489         if (r < 0)
 490                 return r;
 491
 492         if ((flags & O_ACCMODE) == O_RDONLY)
 493                 r = shutdown(fd, SHUT_WR);
 494         else if ((flags & O_ACCMODE) == O_WRONLY)
 495                 r = shutdown(fd, SHUT_RD);
 496         else
 497                 r = 0;
 498         if (r < 0)
 499                 return -errno;
 500
 501         return TAKE_FD(fd);
 502 }
 503
 504 static int fixup_input(
 505                 const ExecContext *context,
 506                 int socket_fd,
 507                 bool apply_tty_stdin) {
 508
 509         ExecInput std_input;
 510
 511         assert(context);
 512
 513         std_input = context->std_input;
 514
 515         if (is_terminal_input(std_input) && !apply_tty_stdin)
 516                 return EXEC_INPUT_NULL;
 517
 518         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 519                 return EXEC_INPUT_NULL;
 520
 521         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 522                 return EXEC_INPUT_NULL;
 523
 524         return std_input;
 525 }
 526
 527 static int fixup_output(ExecOutput output, int socket_fd) {
 528
 529         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 530                 return EXEC_OUTPUT_INHERIT;
 531
 532         return output;
 533 }
 534
 535 static int setup_input(
 536                 const ExecContext *context,
 537                 const ExecParameters *params,
 538                 int socket_fd,
 539                 const int named_iofds[static 3]) {
 540
 541         ExecInput i;
 542         int r;
 543
 544         assert(context);
 545         assert(params);
 546         assert(named_iofds);
 547
 548         if (params->stdin_fd >= 0) {
 549                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 550                         return -errno;
 551
 552                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 553                 if (isatty(STDIN_FILENO)) {
 554                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 555
 556                         (void) exec_context_tty_size(context, &rows, &cols);
 557                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 558                         (void) reset_terminal_fd(STDIN_FILENO, true);
 559                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 560                 }
 561
 562                 return STDIN_FILENO;
 563         }
 564
 565         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 566
 567         switch (i) {
 568
 569         case EXEC_INPUT_NULL:
 570                 return open_null_as(O_RDONLY, STDIN_FILENO);
 571
 572         case EXEC_INPUT_TTY:
 573         case EXEC_INPUT_TTY_FORCE:
 574         case EXEC_INPUT_TTY_FAIL: {
 575                 unsigned rows, cols;
 576                 int fd;
 577
 578                 fd = acquire_terminal(exec_context_tty_path(context),
 579                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 580                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 581                                                                   ACQUIRE_TERMINAL_WAIT,
 582                                       USEC_INFINITY);
 583                 if (fd < 0)
 584                         return fd;
 585
 586                 r = exec_context_tty_size(context, &rows, &cols);
 587                 if (r < 0)
 588                         return r;
 589
 590                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 591                 if (r < 0)
 592                         return r;
 593
 594                 return move_fd(fd, STDIN_FILENO, false);
 595         }
 596
 597         case EXEC_INPUT_SOCKET:
 598                 assert(socket_fd >= 0);
 599
 600                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 601
 602         case EXEC_INPUT_NAMED_FD:
 603                 assert(named_iofds[STDIN_FILENO] >= 0);
 604
 605                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 606                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 607
 608         case EXEC_INPUT_DATA: {
 609                 int fd;
 610
 611                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 612                 if (fd < 0)
 613                         return fd;
 614
 615                 return move_fd(fd, STDIN_FILENO, false);
 616         }
 617
 618         case EXEC_INPUT_FILE: {
 619                 bool rw;
 620                 int fd;
 621
 622                 assert(context->stdio_file[STDIN_FILENO]);
 623
 624                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 625                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 626
 627                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 628                 if (fd < 0)
 629                         return fd;
 630
 631                 return move_fd(fd, STDIN_FILENO, false);
 632         }
 633
 634         default:
 635                 assert_not_reached();
 636         }
 637 }
 638
 639 static bool can_inherit_stderr_from_stdout(
 640                 const ExecContext *context,
 641                 ExecOutput o,
 642                 ExecOutput e) {
 643
 644         assert(context);
 645
 646         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 647          * stderr fd */
 648
 649         if (e == EXEC_OUTPUT_INHERIT)
 650                 return true;
 651         if (e != o)
 652                 return false;
 653
 654         if (e == EXEC_OUTPUT_NAMED_FD)
 655                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 656
 657         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 658                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 659
 660         return true;
 661 }
 662
 663 static int setup_output(
 664                 const Unit *unit,
 665                 const ExecContext *context,
 666                 const ExecParameters *params,
 667                 int fileno,
 668                 int socket_fd,
 669                 const int named_iofds[static 3],
 670                 const char *ident,
 671                 uid_t uid,
 672                 gid_t gid,
 673                 dev_t *journal_stream_dev,
 674                 ino_t *journal_stream_ino) {
 675
 676         ExecOutput o;
 677         ExecInput i;
 678         int r;
 679
 680         assert(unit);
 681         assert(context);
 682         assert(params);
 683         assert(ident);
 684         assert(journal_stream_dev);
 685         assert(journal_stream_ino);
 686
 687         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 688
 689                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 690                         return -errno;
 691
 692                 return STDOUT_FILENO;
 693         }
 694
 695         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 696                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 697                         return -errno;
 698
 699                 return STDERR_FILENO;
 700         }
 701
 702         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 703         o = fixup_output(context->std_output, socket_fd);
 704
 705         if (fileno == STDERR_FILENO) {
 706                 ExecOutput e;
 707                 e = fixup_output(context->std_error, socket_fd);
 708
 709                 /* This expects the input and output are already set up */
 710
 711                 /* Don't change the stderr file descriptor if we inherit all
 712                  * the way and are not on a tty */
 713                 if (e == EXEC_OUTPUT_INHERIT &&
 714                     o == EXEC_OUTPUT_INHERIT &&
 715                     i == EXEC_INPUT_NULL &&
 716                     !is_terminal_input(context->std_input) &&
 717                     getppid() != 1)
 718                         return fileno;
 719
 720                 /* Duplicate from stdout if possible */
 721                 if (can_inherit_stderr_from_stdout(context, o, e))
 722                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 723
 724                 o = e;
 725
 726         } else if (o == EXEC_OUTPUT_INHERIT) {
 727                 /* If input got downgraded, inherit the original value */
 728                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 729                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 730
 731                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 732                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 733                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 734
 735                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 736                 if (getppid() != 1)
 737                         return fileno;
 738
 739                 /* We need to open /dev/null here anew, to get the right access mode. */
 740                 return open_null_as(O_WRONLY, fileno);
 741         }
 742
 743         switch (o) {
 744
 745         case EXEC_OUTPUT_NULL:
 746                 return open_null_as(O_WRONLY, fileno);
 747
 748         case EXEC_OUTPUT_TTY:
 749                 if (is_terminal_input(i))
 750                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 751
 752                 /* We don't reset the terminal if this is just about output */
 753                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 754
 755         case EXEC_OUTPUT_KMSG:
 756         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 757         case EXEC_OUTPUT_JOURNAL:
 758         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 759                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 760                 if (r < 0) {
 761                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 762                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 763                         r = open_null_as(O_WRONLY, fileno);
 764                 } else {
 765                         struct stat st;
 766
 767                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 768                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 769                          * services to detect whether they are connected to the journal or not.
 770                          *
 771                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 772                          * about STDERR as that's usually the best way to do logging. */
 773
 774                         if (fstat(fileno, &st) >= 0 &&
 775                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 776                                 *journal_stream_dev = st.st_dev;
 777                                 *journal_stream_ino = st.st_ino;
 778                         }
 779                 }
 780                 return r;
 781
 782         case EXEC_OUTPUT_SOCKET:
 783                 assert(socket_fd >= 0);
 784
 785                 return RET_NERRNO(dup2(socket_fd, fileno));
 786
 787         case EXEC_OUTPUT_NAMED_FD:
 788                 assert(named_iofds[fileno] >= 0);
 789
 790                 (void) fd_nonblock(named_iofds[fileno], false);
 791                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 792
 793         case EXEC_OUTPUT_FILE:
 794         case EXEC_OUTPUT_FILE_APPEND:
 795         case EXEC_OUTPUT_FILE_TRUNCATE: {
 796                 bool rw;
 797                 int fd, flags;
 798
 799                 assert(context->stdio_file[fileno]);
 800
 801                 rw = context->std_input == EXEC_INPUT_FILE &&
 802                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 803
 804                 if (rw)
 805                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 806
 807                 flags = O_WRONLY;
 808                 if (o == EXEC_OUTPUT_FILE_APPEND)
 809                         flags |= O_APPEND;
 810                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 811                         flags |= O_TRUNC;
 812
 813                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 814                 if (fd < 0)
 815                         return fd;
 816
 817                 return move_fd(fd, fileno, 0);
 818         }
 819
 820         default:
 821                 assert_not_reached();
 822         }
 823 }
 824
 825 static int chown_terminal(int fd, uid_t uid) {
 826         int r;
 827
 828         assert(fd >= 0);
 829
 830         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 831         if (isatty(fd) < 1) {
 832                 if (IN_SET(errno, EINVAL, ENOTTY))
 833                         return 0; /* not a tty */
 834
 835                 return -errno;
 836         }
 837
 838         /* This might fail. What matters are the results. */
 839         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 840         if (r < 0)
 841                 return r;
 842
 843         return 1;
 844 }
 845
 846 static int setup_confirm_stdio(
 847                 const ExecContext *context,
 848                 const char *vc,
 849                 int *ret_saved_stdin,
 850                 int *ret_saved_stdout) {
 851
 852         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 853         unsigned rows, cols;
 854         int r;
 855
 856         assert(ret_saved_stdin);
 857         assert(ret_saved_stdout);
 858
 859         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 860         if (saved_stdin < 0)
 861                 return -errno;
 862
 863         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 864         if (saved_stdout < 0)
 865                 return -errno;
 866
 867         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 868         if (fd < 0)
 869                 return fd;
 870
 871         r = chown_terminal(fd, getuid());
 872         if (r < 0)
 873                 return r;
 874
 875         r = reset_terminal_fd(fd, true);
 876         if (r < 0)
 877                 return r;
 878
 879         r = exec_context_tty_size(context, &rows, &cols);
 880         if (r < 0)
 881                 return r;
 882
 883         r = terminal_set_size_fd(fd, vc, rows, cols);
 884         if (r < 0)
 885                 return r;
 886
 887         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 888         TAKE_FD(fd);
 889         if (r < 0)
 890                 return r;
 891
 892         *ret_saved_stdin = TAKE_FD(saved_stdin);
 893         *ret_saved_stdout = TAKE_FD(saved_stdout);
 894         return 0;
 895 }
 896
 897 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 898         assert(err < 0);
 899
 900         if (err == -ETIMEDOUT)
 901                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 902         else {
 903                 errno = -err;
 904                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 905         }
 906 }
 907
 908 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 909         _cleanup_close_ int fd = -EBADF;
 910
 911         assert(vc);
 912
 913         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 914         if (fd < 0)
 915                 return;
 916
 917         write_confirm_error_fd(err, fd, u);
 918 }
 919
 920 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 921         int r = 0;
 922
 923         assert(saved_stdin);
 924         assert(saved_stdout);
 925
 926         release_terminal();
 927
 928         if (*saved_stdin >= 0)
 929                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 930                         r = -errno;
 931
 932         if (*saved_stdout >= 0)
 933                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 934                         r = -errno;
 935
 936         *saved_stdin = safe_close(*saved_stdin);
 937         *saved_stdout = safe_close(*saved_stdout);
 938
 939         return r;
 940 }
 941
 942 enum {
 943         CONFIRM_PRETEND_FAILURE = -1,
 944         CONFIRM_PRETEND_SUCCESS =  0,
 945         CONFIRM_EXECUTE = 1,
 946 };
 947
 948 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 949         int saved_stdout = -1, saved_stdin = -1, r;
 950         _cleanup_free_ char *e = NULL;
 951         char c;
 952
 953         /* For any internal errors, assume a positive response. */
 954         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 955         if (r < 0) {
 956                 write_confirm_error(r, vc, u);
 957                 return CONFIRM_EXECUTE;
 958         }
 959
 960         /* confirm_spawn might have been disabled while we were sleeping. */
 961         if (manager_is_confirm_spawn_disabled(u->manager)) {
 962                 r = 1;
 963                 goto restore_stdio;
 964         }
 965
 966         e = ellipsize(cmdline, 60, 100);
 967         if (!e) {
 968                 log_oom();
 969                 r = CONFIRM_EXECUTE;
 970                 goto restore_stdio;
 971         }
 972
 973         for (;;) {
 974                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 975                 if (r < 0) {
 976                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 977                         r = CONFIRM_EXECUTE;
 978                         goto restore_stdio;
 979                 }
 980
 981                 switch (c) {
 982                 case 'c':
 983                         printf("Resuming normal execution.\n");
 984                         manager_disable_confirm_spawn();
 985                         r = 1;
 986                         break;
 987                 case 'D':
 988                         unit_dump(u, stdout, "  ");
 989                         continue; /* ask again */
 990                 case 'f':
 991                         printf("Failing execution.\n");
 992                         r = CONFIRM_PRETEND_FAILURE;
 993                         break;
 994                 case 'h':
 995                         printf("  c - continue, proceed without asking anymore\n"
 996                                "  D - dump, show the state of the unit\n"
 997                                "  f - fail, don't execute the command and pretend it failed\n"
 998                                "  h - help\n"
 999                                "  i - info, show a short summary of the unit\n"
1000                                "  j - jobs, show jobs that are in progress\n"
1001                                "  s - skip, don't execute the command and pretend it succeeded\n"
1002                                "  y - yes, execute the command\n");
1003                         continue; /* ask again */
1004                 case 'i':
1005                         printf("  Description: %s\n"
1006                                "  Unit:        %s\n"
1007                                "  Command:     %s\n",
1008                                u->id, u->description, cmdline);
1009                         continue; /* ask again */
1010                 case 'j':
1011                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
1012                         continue; /* ask again */
1013                 case 'n':
1014                         /* 'n' was removed in favor of 'f'. */
1015                         printf("Didn't understand 'n', did you mean 'f'?\n");
1016                         continue; /* ask again */
1017                 case 's':
1018                         printf("Skipping execution.\n");
1019                         r = CONFIRM_PRETEND_SUCCESS;
1020                         break;
1021                 case 'y':
1022                         r = CONFIRM_EXECUTE;
1023                         break;
1024                 default:
1025                         assert_not_reached();
1026                 }
1027                 break;
1028         }
1029
1030 restore_stdio:
1031         restore_confirm_stdio(&saved_stdin, &saved_stdout);
1032         return r;
1033 }
1034
1035 static int get_fixed_user(const ExecContext *c, const char **user,
1036                           uid_t *uid, gid_t *gid,
1037                           const char **home, const char **shell) {
1038         int r;
1039         const char *name;
1040
1041         assert(c);
1042
1043         if (!c->user)
1044                 return 0;
1045
1046         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1047          * (i.e. are "/" or "/bin/nologin"). */
1048
1049         name = c->user;
1050         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1051         if (r < 0)
1052                 return r;
1053
1054         *user = name;
1055         return 0;
1056 }
1057
1058 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1059         int r;
1060         const char *name;
1061
1062         assert(c);
1063
1064         if (!c->group)
1065                 return 0;
1066
1067         name = c->group;
1068         r = get_group_creds(&name, gid, 0);
1069         if (r < 0)
1070                 return r;
1071
1072         *group = name;
1073         return 0;
1074 }
1075
1076 static int get_supplementary_groups(const ExecContext *c, const char *user,
1077                                     const char *group, gid_t gid,
1078                                     gid_t **supplementary_gids, int *ngids) {
1079         int r, k = 0;
1080         int ngroups_max;
1081         bool keep_groups = false;
1082         gid_t *groups = NULL;
1083         _cleanup_free_ gid_t *l_gids = NULL;
1084
1085         assert(c);
1086
1087         /*
1088          * If user is given, then lookup GID and supplementary groups list.
1089          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1090          * here and as early as possible so we keep the list of supplementary
1091          * groups of the caller.
1092          */
1093         if (user && gid_is_valid(gid) && gid != 0) {
1094                 /* First step, initialize groups from /etc/groups */
1095                 if (initgroups(user, gid) < 0)
1096                         return -errno;
1097
1098                 keep_groups = true;
1099         }
1100
1101         if (strv_isempty(c->supplementary_groups))
1102                 return 0;
1103
1104         /*
1105          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1106          * be positive, otherwise fail.
1107          */
1108         errno = 0;
1109         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1110         if (ngroups_max <= 0)
1111                 return errno_or_else(EOPNOTSUPP);
1112
1113         l_gids = new(gid_t, ngroups_max);
1114         if (!l_gids)
1115                 return -ENOMEM;
1116
1117         if (keep_groups) {
1118                 /*
1119                  * Lookup the list of groups that the user belongs to, we
1120                  * avoid NSS lookups here too for gid=0.
1121                  */
1122                 k = ngroups_max;
1123                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1124                         return -EINVAL;
1125         } else
1126                 k = 0;
1127
1128         STRV_FOREACH(i, c->supplementary_groups) {
1129                 const char *g;
1130
1131                 if (k >= ngroups_max)
1132                         return -E2BIG;
1133
1134                 g = *i;
1135                 r = get_group_creds(&g, l_gids+k, 0);
1136                 if (r < 0)
1137                         return r;
1138
1139                 k++;
1140         }
1141
1142         /*
1143          * Sets ngids to zero to drop all supplementary groups, happens
1144          * when we are under root and SupplementaryGroups= is empty.
1145          */
1146         if (k == 0) {
1147                 *ngids = 0;
1148                 return 0;
1149         }
1150
1151         /* Otherwise get the final list of supplementary groups */
1152         groups = memdup(l_gids, sizeof(gid_t) * k);
1153         if (!groups)
1154                 return -ENOMEM;
1155
1156         *supplementary_gids = groups;
1157         *ngids = k;
1158
1159         groups = NULL;
1160
1161         return 0;
1162 }
1163
/* Applies the supplementary group list (only if 'ngids' is positive) and then, if 'gid' is valid,
 * switches the real, effective and saved GID to it. Returns 0 on success, negative errno-style value
 * otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1182
/* Updates the securebits of the calling process: the bits in 'mask' are cleared, then 'bits' are set.
 * Returns 0 if the resulting value equals the current one (nothing is written in that case), 1 if the
 * securebits were changed, and a negative errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int current = prctl(PR_GET_SECUREBITS);
        if (current < 0)
                return -errno;

        unsigned before = (unsigned) current;

        /* Clear all securebits defined in mask and set bits */
        unsigned wanted = (before & ~mask) | bits;
        if (wanted == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1201
1202 static int enforce_user(
1203                 const ExecContext *context,
1204                 uid_t uid,
1205                 uint64_t capability_ambient_set) {
1206         assert(context);
1207         int r;
1208
1209         if (!uid_is_valid(uid))
1210                 return 0;
1211
1212         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1213          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1214          * case. */
1215
1216         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1217
1218                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1219                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1220                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1221                 if (r < 0)
1222                         return r;
1223         }
1224
1225         /* Second step: actually set the uids */
1226         if (setresuid(uid, uid, uid) < 0)
1227                 return -errno;
1228
1229         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1230          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1231          * outside of this call. */
1232         return 0;
1233 }
1234
1235 #if HAVE_PAM
1236
1237 static int null_conv(
1238                 int num_msg,
1239                 const struct pam_message **msg,
1240                 struct pam_response **resp,
1241                 void *appdata_ptr) {
1242
1243         /* We don't support conversations */
1244
1245         return PAM_CONV_ERR;
1246 }
1247
1248 #endif
1249
/* Opens a PAM session for the service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that cleans up the PAM state once the calling process goes away.
 *
 *   name      PAM service name, passed to pam_start()
 *   user      user name, passed to pam_start()
 *   uid, gid  credentials the helper process drops to before waiting
 *   tty       TTY to register via PAM_TTY; if NULL, it is derived from stdin if stdin has a TTY name
 *   env       environment fed into PAM via pam_putenv(); on success replaced by the list returned by
 *             pam_getenvlist()
 *   fds       array of 'n_fds' file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On failure returns a negative errno-style
 * value; failures reported by PAM itself are mapped to -EPERM. When built without PAM support this
 * is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Ask PAM to be silent unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not call into PAM with a handle that was never set up */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass the current environment on to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect via getppid() whether we are still around */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying on EINTR */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment reported by PAM back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM errors (pam_code is set) and errno-style errors (r is
         * set, pam_code is still PAM_SUCCESS). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1466
1467 static void rename_process_from_path(const char *path) {
1468         _cleanup_free_ char *buf = NULL;
1469         const char *p;
1470
1471         assert(path);
1472
1473         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1474          * /bin/ps */
1475
1476         if (path_extract_filename(path, &buf) < 0) {
1477                 rename_process("(...)");
1478                 return;
1479         }
1480
1481         size_t l = strlen(buf);
1482         if (l > 8) {
1483                 /* The end of the process name is usually more interesting, since the first bit might just be
1484                  * "systemd-" */
1485                 p = buf + l - 8;
1486                 l = 8;
1487         } else
1488                 p = buf;
1489
1490         char process_name[11];
1491         process_name[0] = '(';
1492         memcpy(process_name+1, p, l);
1493         process_name[1+l] = ')';
1494         process_name[1+l+1] = 0;
1495
1496         rename_process(process_name);
1497 }
1498
1499 static bool context_has_address_families(const ExecContext *c) {
1500         assert(c);
1501
1502         return c->address_families_allow_list ||
1503                 !set_isempty(c->address_families);
1504 }
1505
1506 static bool context_has_syscall_filters(const ExecContext *c) {
1507         assert(c);
1508
1509         return c->syscall_allow_list ||
1510                 !hashmap_isempty(c->syscall_filter);
1511 }
1512
1513 static bool context_has_syscall_logs(const ExecContext *c) {
1514         assert(c);
1515
1516         return c->syscall_log_allow_list ||
1517                 !hashmap_isempty(c->syscall_log);
1518 }
1519
1520 static bool context_has_no_new_privileges(const ExecContext *c) {
1521         assert(c);
1522
1523         if (c->no_new_privileges)
1524                 return true;
1525
1526         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1527                 return false;
1528
1529         /* We need NNP if we have any form of seccomp and are unprivileged */
1530         return c->lock_personality ||
1531                 c->memory_deny_write_execute ||
1532                 c->private_devices ||
1533                 c->protect_clock ||
1534                 c->protect_hostname ||
1535                 c->protect_kernel_tunables ||
1536                 c->protect_kernel_modules ||
1537                 c->protect_kernel_logs ||
1538                 context_has_address_families(c) ||
1539                 exec_context_restrict_namespaces_set(c) ||
1540                 c->restrict_realtime ||
1541                 c->restrict_suid_sgid ||
1542                 !set_isempty(c->syscall_archs) ||
1543                 context_has_syscall_filters(c) ||
1544                 context_has_syscall_logs(c);
1545 }
1546
1547 bool exec_context_has_credentials(const ExecContext *context) {
1548
1549         assert(context);
1550
1551         return !hashmap_isempty(context->set_credentials) ||
1552                 !hashmap_isempty(context->load_credentials) ||
1553                 !set_isempty(context->import_credentials);
1554 }
1555
1556 #if HAVE_SECCOMP
1557
1558 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1559
1560         if (is_seccomp_available())
1561                 return false;
1562
1563         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1564         return true;
1565 }
1566
1567 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1568         uint32_t negative_action, default_action, action;
1569         int r;
1570
1571         assert(u);
1572         assert(c);
1573
1574         if (!context_has_syscall_filters(c))
1575                 return 0;
1576
1577         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1578                 return 0;
1579
1580         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1581
1582         if (c->syscall_allow_list) {
1583                 default_action = negative_action;
1584                 action = SCMP_ACT_ALLOW;
1585         } else {
1586                 default_action = SCMP_ACT_ALLOW;
1587                 action = negative_action;
1588         }
1589
1590         if (needs_ambient_hack) {
1591                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1592                 if (r < 0)
1593                         return r;
1594         }
1595
1596         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1597 }
1598
1599 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1600 #ifdef SCMP_ACT_LOG
1601         uint32_t default_action, action;
1602 #endif
1603
1604         assert(u);
1605         assert(c);
1606
1607         if (!context_has_syscall_logs(c))
1608                 return 0;
1609
1610 #ifdef SCMP_ACT_LOG
1611         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1612                 return 0;
1613
1614         if (c->syscall_log_allow_list) {
1615                 /* Log nothing but the ones listed */
1616                 default_action = SCMP_ACT_ALLOW;
1617                 action = SCMP_ACT_LOG;
1618         } else {
1619                 /* Log everything but the ones listed */
1620                 default_action = SCMP_ACT_LOG;
1621                 action = SCMP_ACT_ALLOW;
1622         }
1623
1624         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1625 #else
1626         /* old libseccomp */
1627         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1628         return 0;
1629 #endif
1630 }
1631
1632 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1633         assert(u);
1634         assert(c);
1635
1636         if (set_isempty(c->syscall_archs))
1637                 return 0;
1638
1639         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1640                 return 0;
1641
1642         return seccomp_restrict_archs(c->syscall_archs);
1643 }
1644
1645 static int apply_address_families(const Unit* u, const ExecContext *c) {
1646         assert(u);
1647         assert(c);
1648
1649         if (!context_has_address_families(c))
1650                 return 0;
1651
1652         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1653                 return 0;
1654
1655         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1656 }
1657
1658 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1659         int r;
1660
1661         assert(u);
1662         assert(c);
1663
1664         if (!c->memory_deny_write_execute)
1665                 return 0;
1666
1667         /* use prctl() if kernel supports it (6.3) */
1668         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1669         if (r == 0) {
1670                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1671                 return 0;
1672         }
1673         if (r < 0 && errno != EINVAL)
1674                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1675         /* else use seccomp */
1676         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1677
1678         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1679                 return 0;
1680
1681         return seccomp_memory_deny_write_execute();
1682 }
1683
1684 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1685         assert(u);
1686         assert(c);
1687
1688         if (!c->restrict_realtime)
1689                 return 0;
1690
1691         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1692                 return 0;
1693
1694         return seccomp_restrict_realtime();
1695 }
1696
1697 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1698         assert(u);
1699         assert(c);
1700
1701         if (!c->restrict_suid_sgid)
1702                 return 0;
1703
1704         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1705                 return 0;
1706
1707         return seccomp_restrict_suid_sgid();
1708 }
1709
1710 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1711         assert(u);
1712         assert(c);
1713
1714         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1715          * let's protect even those systems where this is left on in the kernel. */
1716
1717         if (!c->protect_kernel_tunables)
1718                 return 0;
1719
1720         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1721                 return 0;
1722
1723         return seccomp_protect_sysctl();
1724 }
1725
1726 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1727         assert(u);
1728         assert(c);
1729
1730         /* Turn off module syscalls on ProtectKernelModules=yes */
1731
1732         if (!c->protect_kernel_modules)
1733                 return 0;
1734
1735         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1736                 return 0;
1737
1738         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1739 }
1740
1741 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1742         assert(u);
1743         assert(c);
1744
1745         if (!c->protect_kernel_logs)
1746                 return 0;
1747
1748         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1749                 return 0;
1750
1751         return seccomp_protect_syslog();
1752 }
1753
1754 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1755         assert(u);
1756         assert(c);
1757
1758         if (!c->protect_clock)
1759                 return 0;
1760
1761         if (skip_seccomp_unavailable(u, "ProtectClock="))
1762                 return 0;
1763
1764         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1765 }
1766
1767 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1768         assert(u);
1769         assert(c);
1770
1771         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1772
1773         if (!c->private_devices)
1774                 return 0;
1775
1776         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1777                 return 0;
1778
1779         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1780 }
1781
1782 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1783         assert(u);
1784         assert(c);
1785
1786         if (!exec_context_restrict_namespaces_set(c))
1787                 return 0;
1788
1789         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1790                 return 0;
1791
1792         return seccomp_restrict_namespaces(c->restrict_namespaces);
1793 }
1794
1795 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1796         unsigned long personality;
1797         int r;
1798
1799         assert(u);
1800         assert(c);
1801
1802         if (!c->lock_personality)
1803                 return 0;
1804
1805         if (skip_seccomp_unavailable(u, "LockPersonality="))
1806                 return 0;
1807
1808         personality = c->personality;
1809
1810         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1811         if (personality == PERSONALITY_INVALID) {
1812
1813                 r = opinionated_personality(&personality);
1814                 if (r < 0)
1815                         return r;
1816         }
1817
1818         return seccomp_lock_personality(personality);
1819 }
1820
1821 #endif
1822
1823 #if HAVE_LIBBPF
1824 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1825         assert(u);
1826         assert(c);
1827
1828         if (!exec_context_restrict_filesystems_set(c))
1829                 return 0;
1830
1831         if (!u->manager->restrict_fs) {
1832                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1833                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1834                 return 0;
1835         }
1836
1837         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1838 }
1839 #endif
1840
1841 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1842         assert(u);
1843         assert(c);
1844
1845         if (!c->protect_hostname)
1846                 return 0;
1847
1848         if (ns_type_supported(NAMESPACE_UTS)) {
1849                 if (unshare(CLONE_NEWUTS) < 0) {
1850                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1851                                 *ret_exit_status = EXIT_NAMESPACE;
1852                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1853                         }
1854
1855                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1856                 }
1857         } else
1858                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1859
1860 #if HAVE_SECCOMP
1861         int r;
1862
1863         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1864                 return 0;
1865
1866         r = seccomp_protect_hostname();
1867         if (r < 0) {
1868                 *ret_exit_status = EXIT_SECCOMP;
1869                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1870         }
1871 #endif
1872
1873         return 0;
1874 }
1875
1876 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1877         assert(idle_pipe);
1878
1879         idle_pipe[1] = safe_close(idle_pipe[1]);
1880         idle_pipe[2] = safe_close(idle_pipe[2]);
1881
1882         if (idle_pipe[0] >= 0) {
1883                 int r;
1884
1885                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1886
1887                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1888                         ssize_t n;
1889
1890                         /* Signal systemd that we are bored and want to continue. */
1891                         n = write(idle_pipe[3], "x", 1);
1892                         if (n > 0)
1893                                 /* Wait for systemd to react to the signal above. */
1894                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1895                 }
1896
1897                 idle_pipe[0] = safe_close(idle_pipe[0]);
1898
1899         }
1900
1901         idle_pipe[3] = safe_close(idle_pipe[3]);
1902 }
1903
1904 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1905
/* Builds the environment block that the manager itself generates for an executed process and returns
 * it as a newly allocated, NULL-terminated string vector in *ret.
 *
 * Depending on the arguments the following variables are emitted, in this order:
 *
 *   $LISTEN_PID, $LISTEN_FDS, $LISTEN_FDNAMES    if n_fds > 0
 *   $WATCHDOG_PID, $WATCHDOG_USEC                if EXEC_SET_WATCHDOG is set and watchdog_usec > 0
 *   $SYSTEMD_NSS_DYNAMIC_BYPASS                  if EXEC_NSS_DYNAMIC_BYPASS is set
 *   $HOME, $LOGNAME, $USER, $SHELL               if home/username/shell are non-NULL
 *   $INVOCATION_ID                               if the unit has a non-null invocation ID
 *   $TERM                                        if exec_context_needs_term() says so
 *   $JOURNAL_STREAM                              if both journal_stream_dev and journal_stream_ino are non-zero
 *   $LOG_NAMESPACE                               if a log namespace is configured
 *   one variable per exec directory type         if a prefix, at least one item and a variable name exist
 *   $CREDENTIALS_DIRECTORY                       if the context has credentials and a runtime prefix is set
 *   $SYSTEMD_EXEC_PID                            always
 *   $MEMORY_PRESSURE_WATCH, $MEMORY_PRESSURE_WRITE  if memory_pressure_path is non-NULL (see below)
 *
 * Returns 0 on success, -ENOMEM if any allocation fails. */
static int build_environment(
                const Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                char **fdnames,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        /* The array is sized up front and never grown: at most 18 fixed variables are added below, plus at
         * most one per exec directory type, which leaves one slot for the terminating NULL (new0() zeroes
         * the array). Keep N_ENV_VARS in sync when adding variables; the assert() at the end checks it. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(fdnames, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (home) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 5 characters of the "HOME=" prefix */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (username) {
                x = strjoin("LOGNAME=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (shell) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 6 characters of the "SHELL=" prefix */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* Otherwise, look for a per-TTY override ("systemd.tty.term.<tty>") on the kernel
                         * command line. Failing to read it is not fatal, we just fall back to the default. */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Neither source provided a value: use the default for this TTY */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* For each exec directory type that has a prefix, at least one configured item and an environment
         * variable name, emit one variable whose value is the ":"-separated list of the prefixed paths. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
                x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* $MEMORY_PRESSURE_WRITE is only generated if we have a cgroup context and the watch path
                 * is not /dev/null. */
                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        /* Format: "<type> <threshold> <window>". If no threshold is configured
                         * (USEC_INFINITY) the default is used, otherwise the configured value is clamped to
                         * the range 1…window. */
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* Note that the trailing NUL byte is part of the Base64-encoded payload (hence the + 1) */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly less than: the final slot must remain NULL to terminate the vector */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2144
2145 static int build_pass_environment(const ExecContext *c, char ***ret) {
2146         _cleanup_strv_free_ char **pass_env = NULL;
2147         size_t n_env = 0;
2148
2149         STRV_FOREACH(i, c->pass_environment) {
2150                 _cleanup_free_ char *x = NULL;
2151                 char *v;
2152
2153                 v = getenv(*i);
2154                 if (!v)
2155                         continue;
2156                 x = strjoin(*i, "=", v);
2157                 if (!x)
2158                         return -ENOMEM;
2159
2160                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2161                         return -ENOMEM;
2162
2163                 pass_env[n_env++] = TAKE_PTR(x);
2164                 pass_env[n_env] = NULL;
2165         }
2166
2167         *ret = TAKE_PTR(pass_env);
2168
2169         return 0;
2170 }
2171
2172 bool exec_needs_network_namespace(const ExecContext *context) {
2173         assert(context);
2174
2175         return context->private_network || context->network_namespace_path;
2176 }
2177
2178 static bool exec_needs_ephemeral(const ExecContext *context) {
2179         return (context->root_image || context->root_directory) && context->root_ephemeral;
2180 }
2181
2182 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2183         assert(context);
2184
2185         return context->private_ipc || context->ipc_namespace_path;
2186 }
2187
2188 bool exec_needs_mount_namespace(
2189                 const ExecContext *context,
2190                 const ExecParameters *params,
2191                 const ExecRuntime *runtime) {
2192
2193         assert(context);
2194
2195         if (context->root_image)
2196                 return true;
2197
2198         if (!strv_isempty(context->read_write_paths) ||
2199             !strv_isempty(context->read_only_paths) ||
2200             !strv_isempty(context->inaccessible_paths) ||
2201             !strv_isempty(context->exec_paths) ||
2202             !strv_isempty(context->no_exec_paths))
2203                 return true;
2204
2205         if (context->n_bind_mounts > 0)
2206                 return true;
2207
2208         if (context->n_temporary_filesystems > 0)
2209                 return true;
2210
2211         if (context->n_mount_images > 0)
2212                 return true;
2213
2214         if (context->n_extension_images > 0)
2215                 return true;
2216
2217         if (!strv_isempty(context->extension_directories))
2218                 return true;
2219
2220         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2221                 return true;
2222
2223         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2224                 return true;
2225
2226         if (context->private_devices ||
2227             context->private_mounts > 0 ||
2228             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2229             context->protect_system != PROTECT_SYSTEM_NO ||
2230             context->protect_home != PROTECT_HOME_NO ||
2231             context->protect_kernel_tunables ||
2232             context->protect_kernel_modules ||
2233             context->protect_kernel_logs ||
2234             context->protect_control_groups ||
2235             context->protect_proc != PROTECT_PROC_DEFAULT ||
2236             context->proc_subset != PROC_SUBSET_ALL ||
2237             exec_needs_ipc_namespace(context))
2238                 return true;
2239
2240         if (context->root_directory) {
2241                 if (exec_context_get_effective_mount_apivfs(context))
2242                         return true;
2243
2244                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2245                         if (params && !params->prefix[t])
2246                                 continue;
2247
2248                         if (context->directories[t].n_items > 0)
2249                                 return true;
2250                 }
2251         }
2252
2253         if (context->dynamic_user &&
2254             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2255              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2256              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2257                 return true;
2258
2259         if (context->log_namespace)
2260                 return true;
2261
2262         return false;
2263 }
2264
/* Moves the calling process into a new user namespace and installs its UID/GID mappings.
 *
 * ouid/ogid are the original IDs (from before any user or group change), uid/gid are the selected IDs.
 * The original IDs are always mapped to themselves; the selected IDs are mapped in addition only if they
 * are valid, differ from the original ones and we hold CAP_SETUID/CAP_SETGID respectively.
 *
 * Returns 0 on success and a negative errno-style error otherwise: -ENOMEM on allocation failure, the
 * error reported by the helper process if it failed, or -EIO if the helper reported something
 * unexpected or did not exit successfully. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value written to the eventfd to wake up the helper */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only ever writes to the error pipe, hence drop the read side */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative errno to the parent through the error pipe, best effort */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write side, so that the read() below sees EOF once the child is
         * done without having reported an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* the child is only supposed to send negative values */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() resets pid, so that the sigkill_waitp cleanup handler leaves the child alone now
         * that we wait for it ourselves. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2423
2424 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2425         assert(context);
2426
2427         if (!context->dynamic_user)
2428                 return false;
2429
2430         if (type == EXEC_DIRECTORY_CONFIGURATION)
2431                 return false;
2432
2433         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2434                 return false;
2435
2436         return true;
2437 }
2438
2439 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2440         _cleanup_free_ char *src_abs = NULL;
2441         int r;
2442
2443         assert(source);
2444
2445         src_abs = path_join(root, source);
2446         if (!src_abs)
2447                 return -ENOMEM;
2448
2449         STRV_FOREACH(dst, symlinks) {
2450                 _cleanup_free_ char *dst_abs = NULL;
2451
2452                 dst_abs = path_join(root, *dst);
2453                 if (!dst_abs)
2454                         return -ENOMEM;
2455
2456                 r = mkdir_parents_label(dst_abs, 0755);
2457                 if (r < 0)
2458                         return r;
2459
2460                 r = symlink_idempotent(src_abs, dst_abs, true);
2461                 if (r < 0)
2462                         return r;
2463         }
2464
2465         return 0;
2466 }
2467
2468 static int setup_exec_directory(
2469                 Unit *u,
2470                 const ExecContext *context,
2471                 const ExecParameters *params,
2472                 uid_t uid,
2473                 gid_t gid,
2474                 ExecDirectoryType type,
2475                 bool needs_mount_namespace,
2476                 int *exit_status) {
2477
2478         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2479                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2480                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2481                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2482                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2483                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2484         };
2485         int r;
2486
2487         assert(context);
2488         assert(params);
2489         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2490         assert(exit_status);
2491
2492         if (!params->prefix[type])
2493                 return 0;
2494
2495         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2496                 if (!uid_is_valid(uid))
2497                         uid = 0;
2498                 if (!gid_is_valid(gid))
2499                         gid = 0;
2500         }
2501
2502         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2503                 _cleanup_free_ char *p = NULL, *pp = NULL;
2504
2505                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2506                 if (!p) {
2507                         r = -ENOMEM;
2508                         goto fail;
2509                 }
2510
2511                 r = mkdir_parents_label(p, 0755);
2512                 if (r < 0)
2513                         goto fail;
2514
2515                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2516
2517                         /* If we are in user mode, and a configuration directory exists but a state directory
2518                          * doesn't exist, then we likely are upgrading from an older systemd version that
2519                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2520                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2521                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2522                          * seperated. If a service has both dirs configured but only the configuration dir
2523                          * exists and the state dir does not, we assume we are looking at an update
2524                          * situation. Hence, create a compatibility symlink, so that all expectations are
2525                          * met.
2526                          *
2527                          * (We also do something similar with the log directory, which still doesn't exist in
2528                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2529
2530                         /* this assumes the state dir is always created before the configuration dir */
2531                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2532                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2533
2534                         r = laccess(p, F_OK);
2535                         if (r == -ENOENT) {
2536                                 _cleanup_free_ char *q = NULL;
2537
2538                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2539                                  * under the configuration hierarchy. */
2540
2541                                 if (type == EXEC_DIRECTORY_STATE)
2542                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2543                                 else if (type == EXEC_DIRECTORY_LOGS)
2544                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2545                                 else
2546                                         assert_not_reached();
2547                                 if (!q) {
2548                                         r = -ENOMEM;
2549                                         goto fail;
2550                                 }
2551
2552                                 r = laccess(q, F_OK);
2553                                 if (r >= 0) {
2554                                         /* It does exist! This hence looks like an update. Symlink the
2555                                          * configuration directory into the state directory. */
2556
2557                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2558                                         if (r < 0)
2559                                                 goto fail;
2560
2561                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2562                                         continue;
2563                                 } else if (r != -ENOENT)
2564                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2565
2566                         } else if (r < 0)
2567                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2568                 }
2569
2570                 if (exec_directory_is_private(context, type)) {
2571                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2572                          * case we want to avoid leaving a directory around fully accessible that is owned by
2573                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2574                          * trick used by container managers to prohibit host users to get access to files of
2575                          * the same UID in containers: we place everything inside a directory that has an
2576                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2577                          * for unprivileged host code. We then use fs namespacing to make this directory
2578                          * permeable for the service itself.
2579                          *
2580                          * Specifically: for a service which wants a special directory "foo/" we first create
2581                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2582                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2583                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2584                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2585                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2586                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2587                          * for the service and making sure it only gets access to the dirs it needs but no
2588                          * others. Tricky? Yes, absolutely, but it works!
2589                          *
2590                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2591                          * to be owned by the service itself.
2592                          *
2593                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2594                          * for sharing files or sockets with other services. */
2595
2596                         pp = path_join(params->prefix[type], "private");
2597                         if (!pp) {
2598                                 r = -ENOMEM;
2599                                 goto fail;
2600                         }
2601
2602                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2603                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2604                         if (r < 0)
2605                                 goto fail;
2606
2607                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2608                                 r = -ENOMEM;
2609                                 goto fail;
2610                         }
2611
2612                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2613                         r = mkdir_parents_label(pp, 0755);
2614                         if (r < 0)
2615                                 goto fail;
2616
2617                         if (is_dir(p, false) > 0 &&
2618                             (laccess(pp, F_OK) == -ENOENT)) {
2619
2620                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2621                                  * it over. Most likely the service has been upgraded from one that didn't use
2622                                  * DynamicUser=1, to one that does. */
2623
2624                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2625                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2626                                               exec_directory_type_to_string(type), p, pp);
2627
2628                                 r = RET_NERRNO(rename(p, pp));
2629                                 if (r < 0)
2630                                         goto fail;
2631                         } else {
2632                                 /* Otherwise, create the actual directory for the service */
2633
2634                                 r = mkdir_label(pp, context->directories[type].mode);
2635                                 if (r < 0 && r != -EEXIST)
2636                                         goto fail;
2637                         }
2638
2639                         if (!context->directories[type].items[i].only_create) {
2640                                 /* And link it up from the original place.
2641                                  * Notes
2642                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2643                                  *    the host, and a new one for the child namespace will be created later.
2644                                  * 2) It is not necessary to create this symlink when one of its parent
2645                                  *    directories is specified and already created. E.g.
2646                                  *        StateDirectory=foo foo/bar
2647                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2648                                  *        pp = "/var/lib/private/foo/bar"
2649                                  *        p = "/var/lib/foo/bar"
2650                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2651                                  *    we do not need to create the symlink, but we cannot create the symlink.
2652                                  *    See issue #24783. */
2653                                 r = symlink_idempotent(pp, p, true);
2654                                 if (r < 0)
2655                                         goto fail;
2656                         }
2657
2658                 } else {
2659                         _cleanup_free_ char *target = NULL;
2660
2661                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2662                             readlink_and_make_absolute(p, &target) >= 0) {
2663                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2664
2665                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2666                                  * by DynamicUser=1 (see above)?
2667                                  *
2668                                  * We do this for all directory types except for ConfigurationDirectory=,
2669                                  * since they all support the private/ symlink logic at least in some
2670                                  * configurations, see above. */
2671
2672                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2673                                 if (r < 0)
2674                                         goto fail;
2675
2676                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2677                                 if (!q) {
2678                                         r = -ENOMEM;
2679                                         goto fail;
2680                                 }
2681
2682                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2683                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2684                                 if (r < 0)
2685                                         goto fail;
2686
2687                                 if (path_equal(q_resolved, target_resolved)) {
2688
2689                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2690                                          * but is no longer. Let's move the directory back up. */
2691
2692                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2693                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2694                                                       exec_directory_type_to_string(type), q, p);
2695
2696                                         r = RET_NERRNO(unlink(p));
2697                                         if (r < 0)
2698                                                 goto fail;
2699
2700                                         r = RET_NERRNO(rename(q, p));
2701                                         if (r < 0)
2702                                                 goto fail;
2703                                 }
2704                         }
2705
2706                         r = mkdir_label(p, context->directories[type].mode);
2707                         if (r < 0) {
2708                                 if (r != -EEXIST)
2709                                         goto fail;
2710
2711                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2712                                         struct stat st;
2713
2714                                         /* Don't change the owner/access mode of the configuration directory,
2715                                          * as in the common case it is not written to by a service, and shall
2716                                          * not be writable. */
2717
2718                                         r = RET_NERRNO(stat(p, &st));
2719                                         if (r < 0)
2720                                                 goto fail;
2721
2722                                         /* Still complain if the access mode doesn't match */
2723                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2724                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2725                                                                  "(File system: %o %sMode: %o)",
2726                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2727                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2728
2729                                         continue;
2730                                 }
2731                         }
2732                 }
2733
2734                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2735                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2736                  * current UID/GID ownership.) */
2737                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2738                 if (r < 0)
2739                         goto fail;
2740
2741                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2742                  * available to user code anyway */
2743                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2744                         continue;
2745
2746                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2747                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2748                  * assignments to exist. */
2749                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2750                 if (r < 0)
2751                         goto fail;
2752         }
2753
2754         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2755          * they are set up later, to allow configuring empty var/run/etc. */
2756         if (!needs_mount_namespace)
2757                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2758                         r = create_many_symlinks(params->prefix[type],
2759                                                  context->directories[type].items[i].path,
2760                                                  context->directories[type].items[i].symlinks);
2761                         if (r < 0)
2762                                 goto fail;
2763                 }
2764
2765         return 0;
2766
2767 fail:
2768         *exit_status = exit_status_table[type];
2769         return r;
2770 }
2771
2772 static int write_credential(
2773                 int dfd,
2774                 const char *id,
2775                 const void *data,
2776                 size_t size,
2777                 uid_t uid,
2778                 bool ownership_ok) {
2779
2780         _cleanup_(unlink_and_freep) char *tmp = NULL;
2781         _cleanup_close_ int fd = -EBADF;
2782         int r;
2783
2784         r = tempfn_random_child("", "cred", &tmp);
2785         if (r < 0)
2786                 return r;
2787
2788         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2789         if (fd < 0) {
2790                 tmp = mfree(tmp);
2791                 return -errno;
2792         }
2793
2794         r = loop_write(fd, data, size, /* do_poll = */ false);
2795         if (r < 0)
2796                 return r;
2797
2798         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2799                 return -errno;
2800
2801         if (uid_is_valid(uid) && uid != getuid()) {
2802                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2803                 if (r < 0) {
2804                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2805                                 return r;
2806
2807                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2808                                             * to express: that the user gets read access and nothing
2809                                             * else. But if the backing fs can't support that (e.g. ramfs)
2810                                             * then we can use file ownership instead. But that's only safe if
2811                                             * we can then re-mount the whole thing read-only, so that the
2812                                             * user can no longer chmod() the file to gain write access. */
2813                                 return r;
2814
2815                         if (fchown(fd, uid, GID_INVALID) < 0)
2816                                 return -errno;
2817                 }
2818         }
2819
2820         if (renameat(dfd, tmp, dfd, id) < 0)
2821                 return -errno;
2822
2823         tmp = mfree(tmp);
2824         return 0;
2825 }
2826
/* Selects which credential store directories credential_search_path() assembles. */
typedef enum CredentialSearchPath {
        CREDENTIAL_SEARCH_PATH_TRUSTED = 0,        /* only the plaintext stores ("credstore") */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED,          /* only the encrypted stores ("credstore.encrypted") */
        CREDENTIAL_SEARCH_PATH_ALL,                /* the encrypted stores first, then the plaintext ones */
        _CREDENTIAL_SEARCH_PATH_MAX,               /* number of valid values above */
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2834
2835 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2836
2837         _cleanup_strv_free_ char **l = NULL;
2838
2839         assert(params);
2840         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2841
2842         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2843          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2844          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2845
2846         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2847                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2848                         return NULL;
2849
2850                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2851                         return NULL;
2852         }
2853
2854         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2855                 if (params->received_credentials_directory)
2856                         if (strv_extend(&l, params->received_credentials_directory) < 0)
2857                                 return NULL;
2858
2859                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2860                         return NULL;
2861         }
2862
2863         if (DEBUG_LOGGING) {
2864                 _cleanup_free_ char *t = strv_join(l, ":");
2865
2866                 log_debug("Credential search path is: %s", strempty(t));
2867         }
2868
2869         return TAKE_PTR(l);
2870 }
2871
2872 static int maybe_decrypt_and_write_credential(
2873                 int dir_fd,
2874                 const char *id,
2875                 bool encrypted,
2876                 uid_t uid,
2877                 bool ownership_ok,
2878                 const char *data,
2879                 size_t size,
2880                 uint64_t *left) {
2881
2882         _cleanup_free_ void *plaintext = NULL;
2883         size_t add;
2884         int r;
2885
2886         if (encrypted) {
2887                 size_t plaintext_size = 0;
2888
2889                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2890                                                 &plaintext, &plaintext_size);
2891                 if (r < 0)
2892                         return r;
2893
2894                 data = plaintext;
2895                 size = plaintext_size;
2896         }
2897
2898         add = strlen(id) + size;
2899         if (add > *left)
2900                 return -E2BIG;
2901
2902         r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2903         if (r < 0)
2904                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2905
2906         *left -= add;
2907         return 0;
2908 }
2909
2910 static int load_credential_glob(
2911                 const char *path,
2912                 bool encrypted,
2913                 char **search_path,
2914                 ReadFullFileFlags flags,
2915                 int write_dfd,
2916                 uid_t uid,
2917                 bool ownership_ok,
2918                 uint64_t *left) {
2919
2920         int r;
2921
2922         STRV_FOREACH(d, search_path) {
2923                 _cleanup_globfree_ glob_t pglob = {};
2924                 _cleanup_free_ char *j = NULL;
2925
2926                 j = path_join(*d, path);
2927                 if (!j)
2928                         return -ENOMEM;
2929
2930                 r = safe_glob(j, 0, &pglob);
2931                 if (r == -ENOENT)
2932                         continue;
2933                 if (r < 0)
2934                         return r;
2935
2936                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2937                         _cleanup_free_ char *fn = NULL;
2938                         _cleanup_(erase_and_freep) char *data = NULL;
2939                         size_t size;
2940
2941                         /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2942                         r = read_full_file_full(
2943                                 AT_FDCWD,
2944                                 pglob.gl_pathv[n],
2945                                 UINT64_MAX,
2946                                 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2947                                 flags,
2948                                 NULL,
2949                                 &data, &size);
2950                         if (r < 0)
2951                                 return log_debug_errno(r, "Failed to read credential '%s': %m",
2952                                                         pglob.gl_pathv[n]);
2953
2954                         r = path_extract_filename(pglob.gl_pathv[n], &fn);
2955                         if (r < 0)
2956                                 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2957                                                         pglob.gl_pathv[n]);
2958
2959                         r = maybe_decrypt_and_write_credential(
2960                                 write_dfd,
2961                                 fn,
2962                                 encrypted,
2963                                 uid,
2964                                 ownership_ok,
2965                                 data, size,
2966                                 left);
2967                         if (r == -EEXIST)
2968                                 continue;
2969                         if (r < 0)
2970                                 return r;
2971                 }
2972         }
2973
2974         return 0;
2975 }
2976
2977 static int load_credential(
2978                 const ExecContext *context,
2979                 const ExecParameters *params,
2980                 const char *id,
2981                 const char *path,
2982                 bool encrypted,
2983                 const char *unit,
2984                 int read_dfd,
2985                 int write_dfd,
2986                 uid_t uid,
2987                 bool ownership_ok,
2988                 uint64_t *left) {
2989
2990         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2991         _cleanup_strv_free_ char **search_path = NULL;
2992         _cleanup_(erase_and_freep) char *data = NULL;
2993         _cleanup_free_ char *bindname = NULL;
2994         const char *source = NULL;
2995         bool missing_ok = true;
2996         size_t size, maxsz;
2997         int r;
2998
2999         assert(context);
3000         assert(params);
3001         assert(id);
3002         assert(path);
3003         assert(unit);
3004         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
3005         assert(write_dfd >= 0);
3006         assert(left);
3007
3008         if (read_dfd >= 0) {
3009                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
3010                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
3011                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
3012                  * open it. */
3013
3014                 if (!filename_is_valid(path)) /* safety check */
3015                         return -EINVAL;
3016
3017                 missing_ok = true;
3018                 source = path;
3019
3020         } else if (path_is_absolute(path)) {
3021                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
3022                  * sockets */
3023
3024                 if (!path_is_valid(path)) /* safety check */
3025                         return -EINVAL;
3026
3027                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
3028
3029                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3030                  * via the source socket address in case we read off an AF_UNIX socket. */
3031                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3032                         return -ENOMEM;
3033
3034                 missing_ok = false;
3035                 source = path;
3036
3037         } else if (credential_name_valid(path)) {
3038                 /* If this is a relative path, take it as credential name relative to the credentials
3039                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3040                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3041
3042                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3043                 if (!search_path)
3044                         return -ENOMEM;
3045
3046                 missing_ok = true;
3047         } else
3048                 source = NULL;
3049
3050         if (encrypted)
3051                 flags |= READ_FULL_FILE_UNBASE64;
3052
3053         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3054
3055         if (search_path) {
3056                 STRV_FOREACH(d, search_path) {
3057                         _cleanup_free_ char *j = NULL;
3058
3059                         j = path_join(*d, path);
3060                         if (!j)
3061                                 return -ENOMEM;
3062
3063                         r = read_full_file_full(
3064                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3065                                         UINT64_MAX,
3066                                         maxsz,
3067                                         flags,
3068                                         NULL,
3069                                         &data, &size);
3070                         if (r != -ENOENT)
3071                                 break;
3072                 }
3073         } else if (source)
3074                 r = read_full_file_full(
3075                                 read_dfd, source,
3076                                 UINT64_MAX,
3077                                 maxsz,
3078                                 flags,
3079                                 bindname,
3080                                 &data, &size);
3081         else
3082                 r = -ENOENT;
3083
3084         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3085                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3086                  * will get clear errors if we don't pass such a missing credential on as they
3087                  * themselves will get ENOENT when trying to read them, which should not be much
3088                  * worse than when we handle the error here and make it fatal.
3089                  *
3090                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3091                  * we are fine, too. */
3092                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3093                 return 0;
3094         }
3095         if (r < 0)
3096                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3097
3098         return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3099 }
3100
/* Arguments handed to load_cred_recurse_dir_cb() through its userdata pointer. Except for 'dfd', which is
 * additionally used to check for already existing credentials, all fields are simply passed on to
 * load_credential(). */
struct load_cred_args {
        const ExecContext *context;
        const ExecParameters *params;
        bool encrypted;        /* whether the credentials found need to be decrypted */
        const char *unit;
        int dfd;               /* credential directory the credentials are written to */
        uid_t uid;
        bool ownership_ok;
        uint64_t *left;        /* remaining size budget, shared across all credentials */
};
3111
3112 static int load_cred_recurse_dir_cb(
3113                 RecurseDirEvent event,
3114                 const char *path,
3115                 int dir_fd,
3116                 int inode_fd,
3117                 const struct dirent *de,
3118                 const struct statx *sx,
3119                 void *userdata) {
3120
3121         struct load_cred_args *args = ASSERT_PTR(userdata);
3122         _cleanup_free_ char *sub_id = NULL;
3123         int r;
3124
3125         if (event != RECURSE_DIR_ENTRY)
3126                 return RECURSE_DIR_CONTINUE;
3127
3128         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3129                 return RECURSE_DIR_CONTINUE;
3130
3131         sub_id = strreplace(path, "/", "_");
3132         if (!sub_id)
3133                 return -ENOMEM;
3134
3135         if (!credential_name_valid(sub_id))
3136                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3137
3138         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3139                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3140                 return RECURSE_DIR_CONTINUE;
3141         }
3142         if (errno != ENOENT)
3143                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3144
3145         r = load_credential(
3146                         args->context,
3147                         args->params,
3148                         sub_id,
3149                         de->d_name,
3150                         args->encrypted,
3151                         args->unit,
3152                         dir_fd,
3153                         args->dfd,
3154                         args->uid,
3155                         args->ownership_ok,
3156                         args->left);
3157         if (r < 0)
3158                 return r;
3159
3160         return RECURSE_DIR_CONTINUE;
3161 }
3162
3163 static int acquire_credentials(
3164                 const ExecContext *context,
3165                 const ExecParameters *params,
3166                 const char *unit,
3167                 const char *p,
3168                 uid_t uid,
3169                 bool ownership_ok) {
3170
3171         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3172         _cleanup_close_ int dfd = -EBADF;
3173         const char *ic;
3174         ExecLoadCredential *lc;
3175         ExecSetCredential *sc;
3176         int r;
3177
3178         assert(context);
3179         assert(p);
3180
3181         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3182         if (dfd < 0)
3183                 return -errno;
3184
3185         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3186         if (r < 0)
3187                 return r;
3188
3189         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3190         HASHMAP_FOREACH(lc, context->load_credentials) {
3191                 _cleanup_close_ int sub_fd = -EBADF;
3192
3193                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3194                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3195                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
3196                  * propagate a credential passed to us from further up. */
3197
3198                 if (path_is_absolute(lc->path)) {
3199                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3200                         if (sub_fd < 0 && !IN_SET(errno,
3201                                                   ENOTDIR,  /* Not a directory */
3202                                                   ENOENT))  /* Doesn't exist? */
3203                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3204                 }
3205
3206                 if (sub_fd < 0)
3207                         /* Regular file (incl. a credential passed in from higher up) */
3208                         r = load_credential(
3209                                         context,
3210                                         params,
3211                                         lc->id,
3212                                         lc->path,
3213                                         lc->encrypted,
3214                                         unit,
3215                                         AT_FDCWD,
3216                                         dfd,
3217                                         uid,
3218                                         ownership_ok,
3219                                         &left);
3220                 else
3221                         /* Directory */
3222                         r = recurse_dir(
3223                                         sub_fd,
3224                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3225                                         /* statx_mask= */ 0,
3226                                         /* n_depth_max= */ UINT_MAX,
3227                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3228                                         load_cred_recurse_dir_cb,
3229                                         &(struct load_cred_args) {
3230                                                 .context = context,
3231                                                 .params = params,
3232                                                 .encrypted = lc->encrypted,
3233                                                 .unit = unit,
3234                                                 .dfd = dfd,
3235                                                 .uid = uid,
3236                                                 .ownership_ok = ownership_ok,
3237                                                 .left = &left,
3238                                         });
3239                 if (r < 0)
3240                         return r;
3241         }
3242
3243         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3244          * override any credentials found earlier. */
3245         SET_FOREACH(ic, context->import_credentials) {
3246                 _cleanup_free_ char **search_path = NULL;
3247
3248                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3249                 if (!search_path)
3250                         return -ENOMEM;
3251
3252                 r = load_credential_glob(
3253                                 ic,
3254                                 /* encrypted = */ false,
3255                                 search_path,
3256                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3257                                 dfd,
3258                                 uid,
3259                                 ownership_ok,
3260                                 &left);
3261                 if (r < 0)
3262                         return r;
3263
3264                 search_path = strv_free(search_path);
3265                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3266                 if (!search_path)
3267                         return -ENOMEM;
3268
3269                 r = load_credential_glob(
3270                                 ic,
3271                                 /* encrypted = */ true,
3272                                 search_path,
3273                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3274                                 dfd,
3275                                 uid,
3276                                 ownership_ok,
3277                                 &left);
3278                 if (r < 0)
3279                         return r;
3280         }
3281
3282         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3283          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3284         HASHMAP_FOREACH(sc, context->set_credentials) {
3285                 _cleanup_(erase_and_freep) void *plaintext = NULL;
3286                 const char *data;
3287                 size_t size, add;
3288
3289                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3290                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3291                  * slow and involved, hence it's nice to be able to skip that if the credential already
3292                  * exists anyway. */
3293                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3294                         continue;
3295                 if (errno != ENOENT)
3296                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3297
3298                 if (sc->encrypted) {
3299                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3300                         if (r < 0)
3301                                 return r;
3302
3303                         data = plaintext;
3304                 } else {
3305                         data = sc->data;
3306                         size = sc->size;
3307                 }
3308
3309                 add = strlen(sc->id) + size;
3310                 if (add > left)
3311                         return -E2BIG;
3312
3313                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3314                 if (r < 0)
3315                         return r;
3316
3317                 left -= add;
3318         }
3319
3320         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3321         if (r < 0)
3322                 return r;
3323
3324         /* After we created all keys with the right perms, also make sure the credential store as a whole is
3325          * accessible */
3326
3327         if (uid_is_valid(uid) && uid != getuid()) {
3328                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3329                 if (r < 0) {
3330                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3331                                 return r;
3332
3333                         if (!ownership_ok)
3334                                 return r;
3335
3336                         if (fchown(dfd, uid, GID_INVALID) < 0)
3337                                 return -errno;
3338                 }
3339         }
3340
3341         return 0;
3342 }
3343
3344 static int setup_credentials_internal(
3345                 const ExecContext *context,
3346                 const ExecParameters *params,
3347                 const char *unit,
3348                 const char *final,        /* This is where the credential store shall eventually end up at */
3349                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
3350                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
3351                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3352                 uid_t uid) {
3353
3354         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3355                                    * if we mounted something; false if we definitely can't mount anything */
3356         bool final_mounted;
3357         const char *where;
3358
3359         assert(context);
3360         assert(final);
3361         assert(workspace);
3362
3363         if (reuse_workspace) {
3364                 r = path_is_mount_point(workspace, NULL, 0);
3365                 if (r < 0)
3366                         return r;
3367                 if (r > 0)
3368                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3369                 else
3370                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3371         } else
3372                 workspace_mounted = -1; /* ditto */
3373
3374         r = path_is_mount_point(final, NULL, 0);
3375         if (r < 0)
3376                 return r;
3377         if (r > 0) {
3378                 /* If the final place already has something mounted, we use that. If the workspace also has
3379                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3380                  * different). */
3381                 final_mounted = true;
3382
3383                 if (workspace_mounted < 0) {
3384                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
3385                          * the final version to the workspace, and make it writable, so that we can make
3386                          * changes */
3387
3388                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3389                         if (r < 0)
3390                                 return r;
3391
3392                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3393                         if (r < 0)
3394                                 return r;
3395
3396                         workspace_mounted = true;
3397                 }
3398         } else
3399                 final_mounted = false;
3400
3401         if (workspace_mounted < 0) {
3402                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3403
3404                 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3405                 if (r < 0) {
3406                         /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3407                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3408                         if (r < 0) {
3409                                 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3410                                         return r;
3411
3412                                 if (must_mount) /* If we it's not OK to use the plain directory
3413                                                  * fallback, propagate all errors too */
3414                                         return r;
3415
3416                                 /* If we lack privileges to bind mount stuff, then let's gracefully
3417                                  * proceed for compat with container envs, and just use the final dir
3418                                  * as is. */
3419
3420                                 workspace_mounted = false;
3421                         } else {
3422                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3423                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3424                                 if (r < 0)
3425                                         return r;
3426
3427                                 workspace_mounted = true;
3428                         }
3429                 } else
3430                         workspace_mounted = true;
3431         }
3432
3433         assert(!must_mount || workspace_mounted > 0);
3434         where = workspace_mounted ? workspace : final;
3435
3436         (void) label_fix_full(AT_FDCWD, where, final, 0);
3437
3438         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3439         if (r < 0)
3440                 return r;
3441
3442         if (workspace_mounted) {
3443                 bool install;
3444
3445                 /* Determine if we should actually install the prepared mount in the final location by bind
3446                  * mounting it there. We do so only if the mount is not established there already, and if the
3447                  * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3448                  * case we are doing all this in a mount namespace, thus no one else will see that we
3449                  * allocated a file system we are getting rid of again here. */
3450                 if (final_mounted)
3451                         install = false; /* already installed */
3452                 else {
3453                         r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3454                         if (r < 0)
3455                                 return r;
3456
3457                         install = r == 0; /* install only if non-empty */
3458                 }
3459
3460                 if (install) {
3461                         /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3462                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3463                         if (r < 0)
3464                                 return r;
3465
3466                         /* And mount it to the final place, read-only */
3467                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3468                 } else
3469                         /* Otherwise get rid of it */
3470                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3471                 if (r < 0)
3472                         return r;
3473         } else {
3474                 _cleanup_free_ char *parent = NULL;
3475
3476                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3477                  * open access to the top-level credential directory and the per-service directory now */
3478
3479                 r = path_extract_directory(final, &parent);
3480                 if (r < 0)
3481                         return r;
3482                 if (chmod(parent, 0755) < 0)
3483                         return -errno;
3484         }
3485
3486         return 0;
3487 }
3488
3489 static int setup_credentials(
3490                 const ExecContext *context,
3491                 const ExecParameters *params,
3492                 const char *unit,
3493                 uid_t uid) {
3494
3495         _cleanup_free_ char *p = NULL, *q = NULL;
3496         int r;
3497
3498         assert(context);
3499         assert(params);
3500
3501         if (!exec_context_has_credentials(context))
3502                 return 0;
3503
3504         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3505                 return -EINVAL;
3506
3507         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3508          * and the subdir we mount over with a read-only file system readable by the service's user */
3509         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3510         if (!q)
3511                 return -ENOMEM;
3512
3513         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3514         if (r < 0 && r != -EEXIST)
3515                 return r;
3516
3517         p = path_join(q, unit);
3518         if (!p)
3519                 return -ENOMEM;
3520
3521         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3522         if (r < 0 && r != -EEXIST)
3523                 return r;
3524
3525         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3526         if (r < 0) {
3527                 _cleanup_free_ char *t = NULL, *u = NULL;
3528
3529                 /* If this is not a privilege or support issue then propagate the error */
3530                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3531                         return r;
3532
3533                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3534                  * it into place, so that users can't access half-initialized credential stores. */
3535                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3536                 if (!t)
3537                         return -ENOMEM;
3538
3539                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3540                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3541                  * after it is fully set up */
3542                 u = path_join(t, unit);
3543                 if (!u)
3544                         return -ENOMEM;
3545
3546                 FOREACH_STRING(i, t, u) {
3547                         r = mkdir_label(i, 0700);
3548                         if (r < 0 && r != -EEXIST)
3549                                 return r;
3550                 }
3551
3552                 r = setup_credentials_internal(
3553                                 context,
3554                                 params,
3555                                 unit,
3556                                 p,       /* final mount point */
3557                                 u,       /* temporary workspace to overmount */
3558                                 true,    /* reuse the workspace if it is already a mount */
3559                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3560                                 uid);
3561
3562                 (void) rmdir(u); /* remove the workspace again if we can. */
3563
3564                 if (r < 0)
3565                         return r;
3566
3567         } else if (r == 0) {
3568
3569                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3570                  * we can use the same directory for all cases, after turning off propagation. Question
3571                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3572                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3573                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3574                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3575                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3576                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3577                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3578                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3579                  * propagation on the former, and then overmount the latter.
3580                  *
3581                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3582                  * for this purpose, but there are few other candidates that work equally well for us, and
3583                  * given that the we do this in a privately namespaced short-lived single-threaded process
3584                  * that no one else sees this should be OK to do. */
3585
3586                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3587                 if (r < 0)
3588                         goto child_fail;
3589
3590                 r = setup_credentials_internal(
3591                                 context,
3592                                 params,
3593                                 unit,
3594                                 p,           /* final mount point */
3595                                 "/dev/shm",  /* temporary workspace to overmount */
3596                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3597                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3598                                 uid);
3599                 if (r < 0)
3600                         goto child_fail;
3601
3602                 _exit(EXIT_SUCCESS);
3603
3604         child_fail:
3605                 _exit(EXIT_FAILURE);
3606         }
3607
3608         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3609          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3610          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3611          * seen by users when trying access this inode. */
3612         (void) rmdir(p);
3613         return 0;
3614 }
3615
3616 #if ENABLE_SMACK
3617 static int setup_smack(
3618                 const Manager *manager,
3619                 const ExecContext *context,
3620                 int executable_fd) {
3621         int r;
3622
3623         assert(context);
3624         assert(executable_fd >= 0);
3625
3626         if (context->smack_process_label) {
3627                 r = mac_smack_apply_pid(0, context->smack_process_label);
3628                 if (r < 0)
3629                         return r;
3630         } else if (manager->default_smack_process_label) {
3631                 _cleanup_free_ char *exec_label = NULL;
3632
3633                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3634                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3635                         return r;
3636
3637                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3638                 if (r < 0)
3639                         return r;
3640         }
3641
3642         return 0;
3643 }
3644 #endif
3645
3646 static int compile_bind_mounts(
3647                 const ExecContext *context,
3648                 const ExecParameters *params,
3649                 BindMount **ret_bind_mounts,
3650                 size_t *ret_n_bind_mounts,
3651                 char ***ret_empty_directories) {
3652
3653         _cleanup_strv_free_ char **empty_directories = NULL;
3654         BindMount *bind_mounts = NULL;
3655         size_t n, h = 0;
3656         int r;
3657
3658         assert(context);
3659         assert(params);
3660         assert(ret_bind_mounts);
3661         assert(ret_n_bind_mounts);
3662         assert(ret_empty_directories);
3663
3664         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3665
3666         n = context->n_bind_mounts;
3667         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3668                 if (!params->prefix[t])
3669                         continue;
3670
3671                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3672                         n += !context->directories[t].items[i].only_create;
3673         }
3674
3675         if (n <= 0) {
3676                 *ret_bind_mounts = NULL;
3677                 *ret_n_bind_mounts = 0;
3678                 *ret_empty_directories = NULL;
3679                 return 0;
3680         }
3681
3682         bind_mounts = new(BindMount, n);
3683         if (!bind_mounts)
3684                 return -ENOMEM;
3685
3686         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3687                 BindMount *item = context->bind_mounts + i;
3688                 _cleanup_free_ char *s = NULL, *d = NULL;
3689
3690                 s = strdup(item->source);
3691                 if (!s)
3692                         return -ENOMEM;
3693
3694                 d = strdup(item->destination);
3695                 if (!d)
3696                         return -ENOMEM;
3697
3698                 bind_mounts[h++] = (BindMount) {
3699                         .source = TAKE_PTR(s),
3700                         .destination = TAKE_PTR(d),
3701                         .read_only = item->read_only,
3702                         .recursive = item->recursive,
3703                         .ignore_enoent = item->ignore_enoent,
3704                 };
3705         }
3706
3707         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3708                 if (!params->prefix[t])
3709                         continue;
3710
3711                 if (context->directories[t].n_items == 0)
3712                         continue;
3713
3714                 if (exec_directory_is_private(context, t) &&
3715                     !exec_context_with_rootfs(context)) {
3716                         char *private_root;
3717
3718                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3719                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3720                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3721
3722                         private_root = path_join(params->prefix[t], "private");
3723                         if (!private_root)
3724                                 return -ENOMEM;
3725
3726                         r = strv_consume(&empty_directories, private_root);
3727                         if (r < 0)
3728                                 return r;
3729                 }
3730
3731                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3732                         _cleanup_free_ char *s = NULL, *d = NULL;
3733
3734                         /* When one of the parent directories is in the list, we cannot create the symlink
3735                          * for the child directory. See also the comments in setup_exec_directory(). */
3736                         if (context->directories[t].items[i].only_create)
3737                                 continue;
3738
3739                         if (exec_directory_is_private(context, t))
3740                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3741                         else
3742                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3743                         if (!s)
3744                                 return -ENOMEM;
3745
3746                         if (exec_directory_is_private(context, t) &&
3747                             exec_context_with_rootfs(context))
3748                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3749                                  * directory is not created on the root directory. So, let's bind-mount the directory
3750                                  * on the 'non-private' place. */
3751                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3752                         else
3753                                 d = strdup(s);
3754                         if (!d)
3755                                 return -ENOMEM;
3756
3757                         bind_mounts[h++] = (BindMount) {
3758                                 .source = TAKE_PTR(s),
3759                                 .destination = TAKE_PTR(d),
3760                                 .read_only = false,
3761                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3762                                 .recursive = true,
3763                                 .ignore_enoent = false,
3764                         };
3765                 }
3766         }
3767
3768         assert(h == n);
3769
3770         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3771         *ret_n_bind_mounts = n;
3772         *ret_empty_directories = TAKE_PTR(empty_directories);
3773
3774         return (int) n;
3775 }
3776
3777 /* ret_symlinks will contain a list of pairs src:dest that describes
3778  * the symlinks to create later on. For example, the symlinks needed
3779  * to safely give private directories to DynamicUser=1 users. */
3780 static int compile_symlinks(
3781                 const ExecContext *context,
3782                 const ExecParameters *params,
3783                 char ***ret_symlinks) {
3784
3785         _cleanup_strv_free_ char **symlinks = NULL;
3786         int r;
3787
3788         assert(context);
3789         assert(params);
3790         assert(ret_symlinks);
3791
3792         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3793                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3794                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3795
3796                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3797                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3798
3799                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3800                                 dst_abs = path_join(params->prefix[dt], *symlink);
3801                                 if (!src_abs || !dst_abs)
3802                                         return -ENOMEM;
3803
3804                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3805                                 if (r < 0)
3806                                         return r;
3807                         }
3808
3809                         if (!exec_directory_is_private(context, dt) ||
3810                             exec_context_with_rootfs(context) ||
3811                             context->directories[dt].items[i].only_create)
3812                                 continue;
3813
3814                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3815                         if (!private_path)
3816                                 return -ENOMEM;
3817
3818                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3819                         if (!path)
3820                                 return -ENOMEM;
3821
3822                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3823                         if (r < 0)
3824                                 return r;
3825                 }
3826         }
3827
3828         *ret_symlinks = TAKE_PTR(symlinks);
3829
3830         return 0;
3831 }
3832
3833 static bool insist_on_sandboxing(
3834                 const ExecContext *context,
3835                 const char *root_dir,
3836                 const char *root_image,
3837                 const BindMount *bind_mounts,
3838                 size_t n_bind_mounts) {
3839
3840         assert(context);
3841         assert(n_bind_mounts == 0 || bind_mounts);
3842
3843         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3844          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3845          * rearrange stuff in a way we cannot ignore gracefully. */
3846
3847         if (context->n_temporary_filesystems > 0)
3848                 return true;
3849
3850         if (root_dir || root_image)
3851                 return true;
3852
3853         if (context->n_mount_images > 0)
3854                 return true;
3855
3856         if (context->dynamic_user)
3857                 return true;
3858
3859         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3860                 return true;
3861
3862         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3863          * essential. */
3864         for (size_t i = 0; i < n_bind_mounts; i++)
3865                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3866                         return true;
3867
3868         if (context->log_namespace)
3869                 return true;
3870
3871         return false;
3872 }
3873
3874 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3875         _cleanup_close_ int fd = -EBADF;
3876         int r;
3877
3878         if (!runtime || !runtime->ephemeral_copy)
3879                 return 0;
3880
3881         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3882         if (r < 0)
3883                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3884
3885         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3886
3887         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3888         if (fd >= 0)
3889                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3890                 return 0;
3891
3892         if (fd != -EAGAIN)
3893                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3894
3895         log_debug("Making ephemeral snapshot of %s to %s",
3896                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3897
3898         if (context->root_image)
3899                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3900                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3901         else
3902                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3903                                               AT_FDCWD, runtime->ephemeral_copy,
3904                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3905                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3906                                               BTRFS_SNAPSHOT_RECURSIVE |
3907                                               BTRFS_SNAPSHOT_LOCK_BSD);
3908         if (fd < 0)
3909                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3910                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3911
3912         if (context->root_image) {
3913                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3914                  * which tends to not perform well in combination with lots of random writes.
3915                  *
3916                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3917                  * copy, but we at least want to make the intention clear.
3918                  */
3919                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3920                 if (r < 0)
3921                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3922         }
3923
3924         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3925         if (r < 0)
3926                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3927
3928         return 1;
3929 }
3930
3931 static int verity_settings_prepare(
3932                 VeritySettings *verity,
3933                 const char *root_image,
3934                 const void *root_hash,
3935                 size_t root_hash_size,
3936                 const char *root_hash_path,
3937                 const void *root_hash_sig,
3938                 size_t root_hash_sig_size,
3939                 const char *root_hash_sig_path,
3940                 const char *verity_data_path) {
3941
3942         int r;
3943
3944         assert(verity);
3945
3946         if (root_hash) {
3947                 void *d;
3948
3949                 d = memdup(root_hash, root_hash_size);
3950                 if (!d)
3951                         return -ENOMEM;
3952
3953                 free_and_replace(verity->root_hash, d);
3954                 verity->root_hash_size = root_hash_size;
3955                 verity->designator = PARTITION_ROOT;
3956         }
3957
3958         if (root_hash_sig) {
3959                 void *d;
3960
3961                 d = memdup(root_hash_sig, root_hash_sig_size);
3962                 if (!d)
3963                         return -ENOMEM;
3964
3965                 free_and_replace(verity->root_hash_sig, d);
3966                 verity->root_hash_sig_size = root_hash_sig_size;
3967                 verity->designator = PARTITION_ROOT;
3968         }
3969
3970         if (verity_data_path) {
3971                 r = free_and_strdup(&verity->data_path, verity_data_path);
3972                 if (r < 0)
3973                         return r;
3974         }
3975
3976         r = verity_settings_load(
3977                         verity,
3978                         root_image,
3979                         root_hash_path,
3980                         root_hash_sig_path);
3981         if (r < 0)
3982                 return log_debug_errno(r, "Failed to load root hash: %m");
3983
3984         return 0;
3985 }
3986
3987 static int apply_mount_namespace(
3988                 const Unit *u,
3989                 ExecCommandFlags command_flags,
3990                 const ExecContext *context,
3991                 const ExecParameters *params,
3992                 ExecRuntime *runtime,
3993                 const char *memory_pressure_path,
3994                 char **error_path) {
3995
3996         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3997         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3998                         **read_write_paths_cleanup = NULL;
3999         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
4000                         *extension_dir = NULL;
4001         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4002         char **read_write_paths;
4003         NamespaceInfo ns_info;
4004         bool needs_sandboxing;
4005         BindMount *bind_mounts = NULL;
4006         size_t n_bind_mounts = 0;
4007         int r;
4008
4009         assert(context);
4010
4011         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
4012
4013         if (params->flags & EXEC_APPLY_CHROOT) {
4014                 r = setup_ephemeral(context, runtime);
4015                 if (r < 0)
4016                         return r;
4017
4018                 if (context->root_image)
4019                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
4020                 else
4021                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
4022         }
4023
4024         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
4025         if (r < 0)
4026                 return r;
4027
4028         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
4029         r = compile_symlinks(context, params, &symlinks);
4030         if (r < 0)
4031                 return r;
4032
4033         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4034          * service will need to write to it in order to start the notifications. */
4035         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4036                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4037                 if (!read_write_paths_cleanup)
4038                         return -ENOMEM;
4039
4040                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4041                 if (r < 0)
4042                         return r;
4043
4044                 read_write_paths = read_write_paths_cleanup;
4045         } else
4046                 read_write_paths = context->read_write_paths;
4047
4048         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4049         if (needs_sandboxing) {
4050                 /* The runtime struct only contains the parent of the private /tmp,
4051                  * which is non-accessible to world users. Inside of it there's a /tmp
4052                  * that is sticky, and that's the one we want to use here.
4053                  * This does not apply when we are using /run/systemd/empty as fallback. */
4054
4055                 if (context->private_tmp && runtime && runtime->shared) {
4056                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4057                                 tmp_dir = runtime->shared->tmp_dir;
4058                         else if (runtime->shared->tmp_dir)
4059                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4060
4061                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4062                                 var_tmp_dir = runtime->shared->var_tmp_dir;
4063                         else if (runtime->shared->var_tmp_dir)
4064                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4065                 }
4066
4067                 ns_info = (NamespaceInfo) {
4068                         .ignore_protect_paths = false,
4069                         .private_dev = context->private_devices,
4070                         .protect_control_groups = context->protect_control_groups,
4071                         .protect_kernel_tunables = context->protect_kernel_tunables,
4072                         .protect_kernel_modules = context->protect_kernel_modules,
4073                         .protect_kernel_logs = context->protect_kernel_logs,
4074                         .protect_hostname = context->protect_hostname,
4075                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4076                         .protect_home = context->protect_home,
4077                         .protect_system = context->protect_system,
4078                         .protect_proc = context->protect_proc,
4079                         .proc_subset = context->proc_subset,
4080                         .private_network = exec_needs_network_namespace(context),
4081                         .private_ipc = exec_needs_ipc_namespace(context),
4082                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4083                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4084                 };
4085         } else if (!context->dynamic_user && root_dir)
4086                 /*
4087                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4088                  * sandbox info, otherwise enforce it, don't ignore protected paths and
4089                  * fail if we are enable to apply the sandbox inside the mount namespace.
4090                  */
4091                 ns_info = (NamespaceInfo) {
4092                         .ignore_protect_paths = true,
4093                 };
4094         else
4095                 ns_info = (NamespaceInfo) {};
4096
4097         if (context->mount_propagation_flag == MS_SHARED)
4098                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4099
4100         if (exec_context_has_credentials(context) &&
4101             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4102             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4103                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4104                 if (!creds_path)
4105                         return -ENOMEM;
4106         }
4107
4108         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4109                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4110                 if (!propagate_dir)
4111                         return -ENOMEM;
4112
4113                 incoming_dir = strdup("/run/systemd/incoming");
4114                 if (!incoming_dir)
4115                         return -ENOMEM;
4116
4117                 extension_dir = strdup("/run/systemd/unit-extensions");
4118                 if (!extension_dir)
4119                         return -ENOMEM;
4120         } else {
4121                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4122
4123                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4124                         return -ENOMEM;
4125         }
4126
4127         if (root_image) {
4128                 r = verity_settings_prepare(
4129                         &verity,
4130                         root_image,
4131                         context->root_hash, context->root_hash_size, context->root_hash_path,
4132                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4133                         context->root_verity);
4134                 if (r < 0)
4135                         return r;
4136         }
4137
4138         r = setup_namespace(
4139                         root_dir,
4140                         root_image,
4141                         context->root_image_options,
4142                         context->root_image_policy ?: &image_policy_service,
4143                         &ns_info,
4144                         read_write_paths,
4145                         needs_sandboxing ? context->read_only_paths : NULL,
4146                         needs_sandboxing ? context->inaccessible_paths : NULL,
4147                         needs_sandboxing ? context->exec_paths : NULL,
4148                         needs_sandboxing ? context->no_exec_paths : NULL,
4149                         empty_directories,
4150                         symlinks,
4151                         bind_mounts,
4152                         n_bind_mounts,
4153                         context->temporary_filesystems,
4154                         context->n_temporary_filesystems,
4155                         context->mount_images,
4156                         context->n_mount_images,
4157                         context->mount_image_policy ?: &image_policy_service,
4158                         tmp_dir,
4159                         var_tmp_dir,
4160                         creds_path,
4161                         context->log_namespace,
4162                         context->mount_propagation_flag,
4163                         &verity,
4164                         context->extension_images,
4165                         context->n_extension_images,
4166                         context->extension_image_policy ?: &image_policy_sysext,
4167                         context->extension_directories,
4168                         propagate_dir,
4169                         incoming_dir,
4170                         extension_dir,
4171                         root_dir || root_image ? params->notify_socket : NULL,
4172                         error_path);
4173
4174         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4175          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4176          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4177          * completely different execution environment. */
4178         if (r == -ENOANO) {
4179                 if (insist_on_sandboxing(
4180                                     context,
4181                                     root_dir, root_image,
4182                                     bind_mounts,
4183                                     n_bind_mounts))
4184                         return log_unit_debug_errno(u,
4185                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
4186                                                     "Failed to set up namespace, and refusing to continue since "
4187                                                     "the selected namespacing options alter mount environment non-trivially.\n"
4188                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4189                                                     n_bind_mounts,
4190                                                     context->n_temporary_filesystems,
4191                                                     yes_no(root_dir),
4192                                                     yes_no(root_image),
4193                                                     yes_no(context->dynamic_user));
4194
4195                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4196                 return 0;
4197         }
4198
4199         return r;
4200 }
4201
4202 static int apply_working_directory(
4203                 const ExecContext *context,
4204                 const ExecParameters *params,
4205                 ExecRuntime *runtime,
4206                 const char *home,
4207                 int *exit_status) {
4208
4209         const char *d, *wd;
4210
4211         assert(context);
4212         assert(exit_status);
4213
4214         if (context->working_directory_home) {
4215
4216                 if (!home) {
4217                         *exit_status = EXIT_CHDIR;
4218                         return -ENXIO;
4219                 }
4220
4221                 wd = home;
4222
4223         } else
4224                 wd = empty_to_root(context->working_directory);
4225
4226         if (params->flags & EXEC_APPLY_CHROOT)
4227                 d = wd;
4228         else
4229                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4230
4231         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4232                 *exit_status = EXIT_CHDIR;
4233                 return -errno;
4234         }
4235
4236         return 0;
4237 }
4238
4239 static int apply_root_directory(
4240                 const ExecContext *context,
4241                 const ExecParameters *params,
4242                 ExecRuntime *runtime,
4243                 const bool needs_mount_ns,
4244                 int *exit_status) {
4245
4246         assert(context);
4247         assert(exit_status);
4248
4249         if (params->flags & EXEC_APPLY_CHROOT)
4250                 if (!needs_mount_ns && context->root_directory)
4251                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4252                                 *exit_status = EXIT_CHROOT;
4253                                 return -errno;
4254                         }
4255
4256         return 0;
4257 }
4258
4259 static int setup_keyring(
4260                 const Unit *u,
4261                 const ExecContext *context,
4262                 const ExecParameters *p,
4263                 uid_t uid, gid_t gid) {
4264
4265         key_serial_t keyring;
4266         int r = 0;
4267         uid_t saved_uid;
4268         gid_t saved_gid;
4269
4270         assert(u);
4271         assert(context);
4272         assert(p);
4273
4274         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4275          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4276          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4277          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4278          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4279          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4280
4281         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4282                 return 0;
4283
4284         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4285          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4286          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4287          * & group is just as nasty as acquiring a reference to the user keyring. */
4288
4289         saved_uid = getuid();
4290         saved_gid = getgid();
4291
4292         if (gid_is_valid(gid) && gid != saved_gid) {
4293                 if (setregid(gid, -1) < 0)
4294                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4295         }
4296
4297         if (uid_is_valid(uid) && uid != saved_uid) {
4298                 if (setreuid(uid, -1) < 0) {
4299                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4300                         goto out;
4301                 }
4302         }
4303
4304         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4305         if (keyring == -1) {
4306                 if (errno == ENOSYS)
4307                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4308                 else if (ERRNO_IS_PRIVILEGE(errno))
4309                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4310                 else if (errno == EDQUOT)
4311                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4312                 else
4313                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4314
4315                 goto out;
4316         }
4317
4318         /* When requested link the user keyring into the session keyring. */
4319         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4320
4321                 if (keyctl(KEYCTL_LINK,
4322                            KEY_SPEC_USER_KEYRING,
4323                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4324                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4325                         goto out;
4326                 }
4327         }
4328
4329         /* Restore uid/gid back */
4330         if (uid_is_valid(uid) && uid != saved_uid) {
4331                 if (setreuid(saved_uid, -1) < 0) {
4332                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4333                         goto out;
4334                 }
4335         }
4336
4337         if (gid_is_valid(gid) && gid != saved_gid) {
4338                 if (setregid(saved_gid, -1) < 0)
4339                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4340         }
4341
4342         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4343         if (!sd_id128_is_null(u->invocation_id)) {
4344                 key_serial_t key;
4345
4346                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4347                 if (key == -1)
4348                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4349                 else {
4350                         if (keyctl(KEYCTL_SETPERM, key,
4351                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4352                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4353                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4354                 }
4355         }
4356
4357 out:
4358         /* Revert back uid & gid for the last time, and exit */
4359         /* no extra logging, as only the first already reported error matters */
4360         if (getuid() != saved_uid)
4361                 (void) setreuid(saved_uid, -1);
4362
4363         if (getgid() != saved_gid)
4364                 (void) setregid(saved_gid, -1);
4365
4366         return r;
4367 }
4368
/* Appends the valid (non-negative) descriptors of a socket pair to array[], starting at index *n,
 * and advances *n by the number of descriptors appended. The caller must make sure array[] has room
 * for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
4379
4380 static int close_remaining_fds(
4381                 const ExecParameters *params,
4382                 const ExecRuntime *runtime,
4383                 int user_lookup_fd,
4384                 int socket_fd,
4385                 const int *fds, size_t n_fds) {
4386
4387         size_t n_dont_close = 0;
4388         int dont_close[n_fds + 14];
4389
4390         assert(params);
4391
4392         if (params->stdin_fd >= 0)
4393                 dont_close[n_dont_close++] = params->stdin_fd;
4394         if (params->stdout_fd >= 0)
4395                 dont_close[n_dont_close++] = params->stdout_fd;
4396         if (params->stderr_fd >= 0)
4397                 dont_close[n_dont_close++] = params->stderr_fd;
4398
4399         if (socket_fd >= 0)
4400                 dont_close[n_dont_close++] = socket_fd;
4401         if (n_fds > 0) {
4402                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4403                 n_dont_close += n_fds;
4404         }
4405
4406         if (runtime)
4407                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4408
4409         if (runtime && runtime->shared) {
4410                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4411                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4412         }
4413
4414         if (runtime && runtime->dynamic_creds) {
4415                 if (runtime->dynamic_creds->user)
4416                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4417                 if (runtime->dynamic_creds->group)
4418                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4419         }
4420
4421         if (user_lookup_fd >= 0)
4422                 dont_close[n_dont_close++] = user_lookup_fd;
4423
4424         return close_all_fds(dont_close, n_dont_close);
4425 }
4426
4427 static int send_user_lookup(
4428                 Unit *unit,
4429                 int user_lookup_fd,
4430                 uid_t uid,
4431                 gid_t gid) {
4432
4433         assert(unit);
4434
4435         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4436          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4437          * specified. */
4438
4439         if (user_lookup_fd < 0)
4440                 return 0;
4441
4442         if (!uid_is_valid(uid) && !gid_is_valid(gid))
4443                 return 0;
4444
4445         if (writev(user_lookup_fd,
4446                (struct iovec[]) {
4447                            IOVEC_MAKE(&uid, sizeof(uid)),
4448                            IOVEC_MAKE(&gid, sizeof(gid)),
4449                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4450                 return -errno;
4451
4452         return 0;
4453 }
4454
4455 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4456         int r;
4457
4458         assert(c);
4459         assert(home);
4460         assert(buf);
4461
4462         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4463
4464         if (*home)
4465                 return 0;
4466
4467         if (!c->working_directory_home)
4468                 return 0;
4469
4470         r = get_home_dir(buf);
4471         if (r < 0)
4472                 return r;
4473
4474         *home = *buf;
4475         return 1;
4476 }
4477
4478 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4479         _cleanup_strv_free_ char ** list = NULL;
4480         int r;
4481
4482         assert(c);
4483         assert(p);
4484         assert(ret);
4485
4486         assert(c->dynamic_user);
4487
4488         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4489          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4490          * directories. */
4491
4492         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4493                 if (t == EXEC_DIRECTORY_CONFIGURATION)
4494                         continue;
4495
4496                 if (!p->prefix[t])
4497                         continue;
4498
4499                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4500                         char *e;
4501
4502                         if (exec_directory_is_private(c, t))
4503                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4504                         else
4505                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4506                         if (!e)
4507                                 return -ENOMEM;
4508
4509                         r = strv_consume(&list, e);
4510                         if (r < 0)
4511                                 return r;
4512                 }
4513         }
4514
4515         *ret = TAKE_PTR(list);
4516
4517         return 0;
4518 }
4519
4520 static int exec_parameters_get_cgroup_path(
4521                 const ExecParameters *params,
4522                 const CGroupContext *c,
4523                 char **ret) {
4524
4525         const char *subgroup = NULL;
4526         char *p;
4527
4528         assert(params);
4529         assert(ret);
4530
4531         if (!params->cgroup_path)
4532                 return -EINVAL;
4533
4534         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4535          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4536          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4537          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4538          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4539          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4540          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4541          * flag, which is only passed for the former statements, not for the latter. */
4542
4543         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4544                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4545                         subgroup = ".control";
4546                 else
4547                         subgroup = c->delegate_subgroup;
4548         }
4549
4550         if (subgroup)
4551                 p = path_join(params->cgroup_path, subgroup);
4552         else
4553                 p = strdup(params->cgroup_path);
4554         if (!p)
4555                 return -ENOMEM;
4556
4557         *ret = p;
4558         return !!subgroup;
4559 }
4560
4561 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4562         _cleanup_(cpu_set_reset) CPUSet s = {};
4563         int r;
4564
4565         assert(c);
4566         assert(ret);
4567
4568         if (!c->numa_policy.nodes.set) {
4569                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4570                 return 0;
4571         }
4572
4573         r = numa_to_cpu_set(&c->numa_policy, &s);
4574         if (r < 0)
4575                 return r;
4576
4577         cpu_set_reset(ret);
4578
4579         return cpu_set_add_all(ret, &s);
4580 }
4581
4582 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4583         assert(c);
4584
4585         return c->cpu_affinity_from_numa;
4586 }
4587
4588 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4589         int r;
4590
4591         assert(fds);
4592         assert(n_fds);
4593         assert(*n_fds < fds_size);
4594         assert(ret_fd);
4595
4596         if (fd < 0) {
4597                 *ret_fd = -EBADF;
4598                 return 0;
4599         }
4600
4601         if (fd < 3 + (int) *n_fds) {
4602                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4603                  * the fds we pass to the process (or which are closed only during execve). */
4604
4605                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4606                 if (r < 0)
4607                         return -errno;
4608
4609                 close_and_replace(fd, r);
4610         }
4611
4612         *ret_fd = fds[*n_fds] = fd;
4613         (*n_fds) ++;
4614         return 1;
4615 }
4616
4617 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4618         union sockaddr_union addr = {
4619                 .un.sun_family = AF_UNIX,
4620         };
4621         socklen_t sa_len;
4622         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4623         int r;
4624
4625         assert(u);
4626         assert(of);
4627         assert(ofd >= 0);
4628
4629         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4630         if (r < 0)
4631                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4632
4633         sa_len = r;
4634
4635         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4636                 _cleanup_close_ int fd = -EBADF;
4637
4638                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4639                 if (fd < 0)
4640                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4641
4642                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4643                 if (r == -EPROTOTYPE)
4644                         continue;
4645                 if (r < 0)
4646                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4647
4648                 return TAKE_FD(fd);
4649         }
4650
4651         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4652 }
4653
4654 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4655         struct stat st;
4656         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4657
4658         assert(u);
4659         assert(of);
4660
4661         ofd = open(of->path, O_PATH | O_CLOEXEC);
4662         if (ofd < 0)
4663                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4664
4665         if (fstat(ofd, &st) < 0)
4666                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4667
4668         if (S_ISSOCK(st.st_mode)) {
4669                 fd = connect_unix_harder(u, of, ofd);
4670                 if (fd < 0)
4671                         return fd;
4672
4673                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4674                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4675                                                     of->path);
4676
4677                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4678         } else {
4679                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4680                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4681                         flags |= O_APPEND;
4682                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4683                         flags |= O_TRUNC;
4684
4685                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4686                 if (fd < 0)
4687                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4688
4689                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4690         }
4691
4692         return TAKE_FD(fd);
4693 }
4694
4695 static int collect_open_file_fds(
4696                 Unit *u,
4697                 OpenFile* open_files,
4698                 int **fds,
4699                 char ***fdnames,
4700                 size_t *n_fds) {
4701         int r;
4702
4703         assert(u);
4704         assert(fds);
4705         assert(fdnames);
4706         assert(n_fds);
4707
4708         LIST_FOREACH(open_files, of, open_files) {
4709                 _cleanup_close_ int fd = -EBADF;
4710
4711                 fd = get_open_file_fd(u, of);
4712                 if (fd < 0) {
4713                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4714                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4715                                 continue;
4716                         }
4717
4718                         return fd;
4719                 }
4720
4721                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4722                         return -ENOMEM;
4723
4724                 r = strv_extend(fdnames, of->fdname);
4725                 if (r < 0)
4726                         return r;
4727
4728                 (*fds)[*n_fds] = TAKE_FD(fd);
4729
4730                 (*n_fds)++;
4731         }
4732
4733         return 0;
4734 }
4735
4736 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4737         assert(unit);
4738         assert(msg);
4739         assert(executable);
4740
4741         if (!DEBUG_LOGGING)
4742                 return;
4743
4744         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4745
4746         log_unit_struct(unit, LOG_DEBUG,
4747                         "EXECUTABLE=%s", executable,
4748                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4749                         LOG_UNIT_INVOCATION_ID(unit));
4750 }
4751
4752 static bool exec_context_need_unprivileged_private_users(
4753                 const ExecContext *context,
4754                 const ExecParameters *params) {
4755
4756         assert(context);
4757         assert(params);
4758
4759         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4760          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4761          * (system manager) then we have privileges and don't need this. */
4762         if (params->runtime_scope != RUNTIME_SCOPE_USER)
4763                 return false;
4764
4765         return context->private_users ||
4766                context->private_tmp ||
4767                context->private_devices ||
4768                context->private_network ||
4769                context->network_namespace_path ||
4770                context->private_ipc ||
4771                context->ipc_namespace_path ||
4772                context->private_mounts > 0 ||
4773                context->mount_apivfs ||
4774                context->n_bind_mounts > 0 ||
4775                context->n_temporary_filesystems > 0 ||
4776                context->root_directory ||
4777                !strv_isempty(context->extension_directories) ||
4778                context->protect_system != PROTECT_SYSTEM_NO ||
4779                context->protect_home != PROTECT_HOME_NO ||
4780                context->protect_kernel_tunables ||
4781                context->protect_kernel_modules ||
4782                context->protect_kernel_logs ||
4783                context->protect_control_groups ||
4784                context->protect_clock ||
4785                context->protect_hostname ||
4786                !strv_isempty(context->read_write_paths) ||
4787                !strv_isempty(context->read_only_paths) ||
4788                !strv_isempty(context->inaccessible_paths) ||
4789                !strv_isempty(context->exec_paths) ||
4790                !strv_isempty(context->no_exec_paths);
4791 }
4792
4793 static int exec_child(
4794                 Unit *unit,
4795                 const ExecCommand *command,
4796                 const ExecContext *context,
4797                 const ExecParameters *params,
4798                 ExecRuntime *runtime,
4799                 const CGroupContext *cgroup_context,
4800                 int socket_fd,
4801                 const int named_iofds[static 3],
4802                 int *params_fds,
4803                 size_t n_socket_fds,
4804                 size_t n_storage_fds,
4805                 char **files_env,
4806                 int user_lookup_fd,
4807                 int *exit_status) {
4808
4809         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4810         int r, ngids = 0, exec_fd;
4811         _cleanup_free_ gid_t *supplementary_gids = NULL;
4812         const char *username = NULL, *groupname = NULL;
4813         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4814         const char *home = NULL, *shell = NULL;
4815         char **final_argv = NULL;
4816         dev_t journal_stream_dev = 0;
4817         ino_t journal_stream_ino = 0;
4818         bool userns_set_up = false;
4819         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4820                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4821                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4822                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4823 #if HAVE_SELINUX
4824         _cleanup_free_ char *mac_selinux_context_net = NULL;
4825         bool use_selinux = false;
4826 #endif
4827 #if ENABLE_SMACK
4828         bool use_smack = false;
4829 #endif
4830 #if HAVE_APPARMOR
4831         bool use_apparmor = false;
4832 #endif
4833         uid_t saved_uid = getuid();
4834         gid_t saved_gid = getgid();
4835         uid_t uid = UID_INVALID;
4836         gid_t gid = GID_INVALID;
4837         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4838                n_keep_fds; /* total number of fds not to close */
4839         int secure_bits;
4840         _cleanup_free_ gid_t *gids_after_pam = NULL;
4841         int ngids_after_pam = 0;
4842         _cleanup_free_ int *fds = NULL;
4843         _cleanup_strv_free_ char **fdnames = NULL;
4844
4845         assert(unit);
4846         assert(command);
4847         assert(context);
4848         assert(params);
4849         assert(exit_status);
4850
4851         /* Explicitly test for CVE-2021-4034 inspired invocations */
4852         assert(command->path);
4853         assert(!strv_isempty(command->argv));
4854
4855         rename_process_from_path(command->path);
4856
4857         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4858          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4859          * both of which will be demoted to SIG_DFL. */
4860         (void) default_signals(SIGNALS_CRASH_HANDLER,
4861                                SIGNALS_IGNORE);
4862
4863         if (context->ignore_sigpipe)
4864                 (void) ignore_signals(SIGPIPE);
4865
4866         r = reset_signal_mask();
4867         if (r < 0) {
4868                 *exit_status = EXIT_SIGNAL_MASK;
4869                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4870         }
4871
4872         if (params->idle_pipe)
4873                 do_idle_pipe_dance(params->idle_pipe);
4874
4875         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4876          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4877          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4878          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4879
4880         log_forget_fds();
4881         log_set_open_when_needed(true);
4882         log_settle_target();
4883
4884         /* In case anything used libc syslog(), close this here, too */
4885         closelog();
4886
4887         fds = newdup(int, params_fds, n_fds);
4888         if (!fds) {
4889                 *exit_status = EXIT_MEMORY;
4890                 return log_oom();
4891         }
4892
4893         fdnames = strv_copy((char**) params->fd_names);
4894         if (!fdnames) {
4895                 *exit_status = EXIT_MEMORY;
4896                 return log_oom();
4897         }
4898
4899         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4900         if (r < 0) {
4901                 *exit_status = EXIT_FDS;
4902                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4903         }
4904
4905         int keep_fds[n_fds + 3];
4906         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4907         n_keep_fds = n_fds;
4908
4909         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4910         if (r < 0) {
4911                 *exit_status = EXIT_FDS;
4912                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4913         }
4914
4915 #if HAVE_LIBBPF
4916         if (unit->manager->restrict_fs) {
4917                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4918                 if (bpf_map_fd < 0) {
4919                         *exit_status = EXIT_FDS;
4920                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4921                 }
4922
4923                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4924                 if (r < 0) {
4925                         *exit_status = EXIT_FDS;
4926                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4927                 }
4928         }
4929 #endif
4930
4931         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4932         if (r < 0) {
4933                 *exit_status = EXIT_FDS;
4934                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4935         }
4936
4937         if (!context->same_pgrp &&
4938             setsid() < 0) {
4939                 *exit_status = EXIT_SETSID;
4940                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4941         }
4942
4943         exec_context_tty_reset(context, params);
4944
4945         if (unit_shall_confirm_spawn(unit)) {
4946                 _cleanup_free_ char *cmdline = NULL;
4947
4948                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4949                 if (!cmdline) {
4950                         *exit_status = EXIT_MEMORY;
4951                         return log_oom();
4952                 }
4953
4954                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4955                 if (r != CONFIRM_EXECUTE) {
4956                         if (r == CONFIRM_PRETEND_SUCCESS) {
4957                                 *exit_status = EXIT_SUCCESS;
4958                                 return 0;
4959                         }
4960                         *exit_status = EXIT_CONFIRM;
4961                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4962                                                     "Execution cancelled by the user");
4963                 }
4964         }
4965
4966         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4967          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4968          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4969          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4970          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4971         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4972             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4973                 *exit_status = EXIT_MEMORY;
4974                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4975         }
4976
4977         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4978                 _cleanup_strv_free_ char **suggested_paths = NULL;
4979
4980                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4981                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4982                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4983                         *exit_status = EXIT_USER;
4984                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4985                 }
4986
4987                 r = compile_suggested_paths(context, params, &suggested_paths);
4988                 if (r < 0) {
4989                         *exit_status = EXIT_MEMORY;
4990                         return log_oom();
4991                 }
4992
4993                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4994                 if (r < 0) {
4995                         *exit_status = EXIT_USER;
4996                         if (r == -EILSEQ)
4997                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4998                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4999                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
5000                 }
5001
5002                 if (!uid_is_valid(uid)) {
5003                         *exit_status = EXIT_USER;
5004                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
5005                 }
5006
5007                 if (!gid_is_valid(gid)) {
5008                         *exit_status = EXIT_USER;
5009                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
5010                 }
5011
5012                 if (runtime->dynamic_creds->user)
5013                         username = runtime->dynamic_creds->user->name;
5014
5015         } else {
5016                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5017                 if (r < 0) {
5018                         *exit_status = EXIT_USER;
5019                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5020                 }
5021
5022                 r = get_fixed_group(context, &groupname, &gid);
5023                 if (r < 0) {
5024                         *exit_status = EXIT_GROUP;
5025                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5026                 }
5027         }
5028
5029         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5030         r = get_supplementary_groups(context, username, groupname, gid,
5031                                      &supplementary_gids, &ngids);
5032         if (r < 0) {
5033                 *exit_status = EXIT_GROUP;
5034                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5035         }
5036
5037         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5038         if (r < 0) {
5039                 *exit_status = EXIT_USER;
5040                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5041         }
5042
5043         user_lookup_fd = safe_close(user_lookup_fd);
5044
5045         r = acquire_home(context, uid, &home, &home_buffer);
5046         if (r < 0) {
5047                 *exit_status = EXIT_CHDIR;
5048                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5049         }
5050
5051         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5052         if (socket_fd >= 0)
5053                 (void) fd_nonblock(socket_fd, false);
5054
5055         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5056          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5057         if (params->cgroup_path) {
5058                 _cleanup_free_ char *p = NULL;
5059
5060                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5061                 if (r < 0) {
5062                         *exit_status = EXIT_CGROUP;
5063                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5064                 }
5065
5066                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5067                 if (r == -EUCLEAN) {
5068                         *exit_status = EXIT_CGROUP;
5069                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5070                                                     "because the cgroup or one of its parents or "
5071                                                     "siblings is in the threaded mode: %m", p);
5072                 }
5073                 if (r < 0) {
5074                         *exit_status = EXIT_CGROUP;
5075                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5076                 }
5077         }
5078
5079         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5080                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5081                 if (r < 0) {
5082                         *exit_status = EXIT_NETWORK;
5083                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5084                 }
5085         }
5086
5087         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5088                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5089                 if (r < 0) {
5090                         *exit_status = EXIT_NAMESPACE;
5091                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5092                 }
5093         }
5094
5095         r = setup_input(context, params, socket_fd, named_iofds);
5096         if (r < 0) {
5097                 *exit_status = EXIT_STDIN;
5098                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5099         }
5100
5101         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5102         if (r < 0) {
5103                 *exit_status = EXIT_STDOUT;
5104                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5105         }
5106
5107         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5108         if (r < 0) {
5109                 *exit_status = EXIT_STDERR;
5110                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5111         }
5112
5113         if (context->oom_score_adjust_set) {
5114                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5115                  * prohibit write access to this file, and we shouldn't trip up over that. */
5116                 r = set_oom_score_adjust(context->oom_score_adjust);
5117                 if (ERRNO_IS_PRIVILEGE(r))
5118                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5119                 else if (r < 0) {
5120                         *exit_status = EXIT_OOM_ADJUST;
5121                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5122                 }
5123         }
5124
5125         if (context->coredump_filter_set) {
5126                 r = set_coredump_filter(context->coredump_filter);
5127                 if (ERRNO_IS_PRIVILEGE(r))
5128                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5129                 else if (r < 0)
5130                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5131         }
5132
5133         if (context->nice_set) {
5134                 r = setpriority_closest(context->nice);
5135                 if (r < 0)
5136                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5137         }
5138
5139         if (context->cpu_sched_set) {
5140                 struct sched_param param = {
5141                         .sched_priority = context->cpu_sched_priority,
5142                 };
5143
5144                 r = sched_setscheduler(0,
5145                                        context->cpu_sched_policy |
5146                                        (context->cpu_sched_reset_on_fork ?
5147                                         SCHED_RESET_ON_FORK : 0),
5148                                        &param);
5149                 if (r < 0) {
5150                         *exit_status = EXIT_SETSCHEDULER;
5151                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5152                 }
5153         }
5154
5155         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5156                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5157                 const CPUSet *cpu_set;
5158
5159                 if (context->cpu_affinity_from_numa) {
5160                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5161                         if (r < 0) {
5162                                 *exit_status = EXIT_CPUAFFINITY;
5163                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5164                         }
5165
5166                         cpu_set = &converted_cpu_set;
5167                 } else
5168                         cpu_set = &context->cpu_set;
5169
5170                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5171                         *exit_status = EXIT_CPUAFFINITY;
5172                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5173                 }
5174         }
5175
5176         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5177                 r = apply_numa_policy(&context->numa_policy);
5178                 if (r < 0) {
5179                         if (ERRNO_IS_NOT_SUPPORTED(r))
5180                                 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5181                         else {
5182                                 *exit_status = EXIT_NUMA_POLICY;
5183                                 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5184                         }
5185                 }
5186         }
5187
5188         if (context->ioprio_set)
5189                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5190                         *exit_status = EXIT_IOPRIO;
5191                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5192                 }
5193
5194         if (context->timer_slack_nsec != NSEC_INFINITY)
5195                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5196                         *exit_status = EXIT_TIMERSLACK;
5197                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5198                 }
5199
5200         if (context->personality != PERSONALITY_INVALID) {
5201                 r = safe_personality(context->personality);
5202                 if (r < 0) {
5203                         *exit_status = EXIT_PERSONALITY;
5204                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5205                 }
5206         }
5207
5208         if (context->utmp_id) {
5209                 const char *line = context->tty_path ?
5210                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5211                         NULL;
5212                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5213                                       line,
5214                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
5215                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5216                                       USER_PROCESS,
5217                                       username);
5218         }
5219
5220         if (uid_is_valid(uid)) {
5221                 r = chown_terminal(STDIN_FILENO, uid);
5222                 if (r < 0) {
5223                         *exit_status = EXIT_STDIN;
5224                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5225                 }
5226         }
5227
5228         if (params->cgroup_path) {
5229                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5230                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5231                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5232                  * touch a single hierarchy too. */
5233
5234                 if (params->flags & EXEC_CGROUP_DELEGATE) {
5235                         _cleanup_free_ char *p = NULL;
5236
5237                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5238                         if (r < 0) {
5239                                 *exit_status = EXIT_CGROUP;
5240                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5241                         }
5242
5243                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5244                         if (r < 0) {
5245                                 *exit_status = EXIT_CGROUP;
5246                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5247                         }
5248                         if (r > 0) {
5249                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5250                                 if (r < 0) {
5251                                         *exit_status = EXIT_CGROUP;
5252                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5253                                 }
5254                         }
5255                 }
5256
5257                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5258                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
5259                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5260                                 if (r < 0) {
5261                                         *exit_status = EXIT_MEMORY;
5262                                         return log_oom();
5263                                 }
5264
5265                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5266                                 if (r < 0) {
5267                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5268                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5269                                         memory_pressure_path = mfree(memory_pressure_path);
5270                                 }
5271                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5272                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5273                                 if (!memory_pressure_path) {
5274                                         *exit_status = EXIT_MEMORY;
5275                                         return log_oom();
5276                                 }
5277                         }
5278                 }
5279         }
5280
5281         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5282
5283         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5284                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5285                 if (r < 0)
5286                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5287         }
5288
5289         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5290                 r = setup_credentials(context, params, unit->id, uid);
5291                 if (r < 0) {
5292                         *exit_status = EXIT_CREDENTIALS;
5293                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5294                 }
5295         }
5296
5297         r = build_environment(
5298                         unit,
5299                         context,
5300                         params,
5301                         cgroup_context,
5302                         n_fds,
5303                         fdnames,
5304                         home,
5305                         username,
5306                         shell,
5307                         journal_stream_dev,
5308                         journal_stream_ino,
5309                         memory_pressure_path,
5310                         &our_env);
5311         if (r < 0) {
5312                 *exit_status = EXIT_MEMORY;
5313                 return log_oom();
5314         }
5315
5316         r = build_pass_environment(context, &pass_env);
5317         if (r < 0) {
5318                 *exit_status = EXIT_MEMORY;
5319                 return log_oom();
5320         }
5321
5322         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5323          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5324          * not specify PATH but the unit has ExecSearchPath. */
5325         if (!strv_isempty(context->exec_search_path)) {
5326                 _cleanup_free_ char *joined = NULL;
5327
5328                 joined = strv_join(context->exec_search_path, ":");
5329                 if (!joined) {
5330                         *exit_status = EXIT_MEMORY;
5331                         return log_oom();
5332                 }
5333
5334                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5335                 if (r < 0) {
5336                         *exit_status = EXIT_MEMORY;
5337                         return log_oom();
5338                 }
5339         }
5340
5341         accum_env = strv_env_merge(params->environment,
5342                                    our_env,
5343                                    joined_exec_search_path,
5344                                    pass_env,
5345                                    context->environment,
5346                                    files_env);
5347         if (!accum_env) {
5348                 *exit_status = EXIT_MEMORY;
5349                 return log_oom();
5350         }
5351         accum_env = strv_env_clean(accum_env);
5352
5353         (void) umask(context->umask);
5354
5355         r = setup_keyring(unit, context, params, uid, gid);
5356         if (r < 0) {
5357                 *exit_status = EXIT_KEYRING;
5358                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5359         }
5360
5361         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5362          * from it. */
5363         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5364
5365         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5366          * for it, and the kernel doesn't actually support ambient caps. */
5367         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5368
5369         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5370          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5371          * desired. */
5372         if (needs_ambient_hack)
5373                 needs_setuid = false;
5374         else
5375                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5376
5377         uint64_t capability_ambient_set = context->capability_ambient_set;
5378
5379         if (needs_sandboxing) {
5380                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5381                  * /sys being present. The actual MAC context application will happen later, as late as
5382                  * possible, to avoid impacting our own code paths. */
5383
5384 #if HAVE_SELINUX
5385                 use_selinux = mac_selinux_use();
5386 #endif
5387 #if ENABLE_SMACK
5388                 use_smack = mac_smack_use();
5389 #endif
5390 #if HAVE_APPARMOR
5391                 use_apparmor = mac_apparmor_use();
5392 #endif
5393         }
5394
5395         if (needs_sandboxing) {
5396                 int which_failed;
5397
5398                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5399                  * is set here. (See below.) */
5400
5401                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5402                 if (r < 0) {
5403                         *exit_status = EXIT_LIMITS;
5404                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5405                 }
5406         }
5407
5408         if (needs_setuid && context->pam_name && username) {
5409                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5410                  * wins here. (See above.) */
5411
5412                 /* All fds passed in the fds array will be closed in the pam child process. */
5413                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5414                 if (r < 0) {
5415                         *exit_status = EXIT_PAM;
5416                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5417                 }
5418
5419                 if (ambient_capabilities_supported()) {
5420                         uint64_t ambient_after_pam;
5421
5422                         /* PAM modules might have set some ambient caps. Query them here and merge them into
5423                          * the caps we want to set in the end, so that we don't end up unsetting them. */
5424                         r = capability_get_ambient(&ambient_after_pam);
5425                         if (r < 0) {
5426                                 *exit_status = EXIT_CAPABILITIES;
5427                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5428                         }
5429
5430                         capability_ambient_set |= ambient_after_pam;
5431                 }
5432
5433                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5434                 if (ngids_after_pam < 0) {
5435                         *exit_status = EXIT_MEMORY;
5436                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5437                 }
5438         }
5439
5440         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5441                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5442                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5443                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5444
5445                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5446                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5447                  * the actual requested operations fail (or silently continue). */
5448                 if (r < 0 && context->private_users) {
5449                         *exit_status = EXIT_USER;
5450                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5451                 }
5452                 if (r < 0)
5453                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5454                 else
5455                         userns_set_up = true;
5456         }
5457
5458         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5459
5460                 /* Try to enable network namespacing if network namespacing is available and we have
5461                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5462                  * new network namespace. And if we don't have that, then we could only create a network
5463                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5464                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5465                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5466                         if (r < 0) {
5467                                 if (ERRNO_IS_PRIVILEGE(r))
5468                                         log_unit_notice_errno(unit, r,
5469                                                                "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5470                                 else {
5471                                         *exit_status = EXIT_NETWORK;
5472                                         return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5473                                 }
5474                         }
5475                 } else if (context->network_namespace_path) {
5476                         *exit_status = EXIT_NETWORK;
5477                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5478                                                     "NetworkNamespacePath= is not supported, refusing.");
5479                 } else
5480                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5481         }
5482
5483         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5484
5485                 if (ns_type_supported(NAMESPACE_IPC)) {
5486                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5487                         if (r == -EPERM)
5488                                 log_unit_warning_errno(unit, r,
5489                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5490                         else if (r < 0) {
5491                                 *exit_status = EXIT_NAMESPACE;
5492                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5493                         }
5494                 } else if (context->ipc_namespace_path) {
5495                         *exit_status = EXIT_NAMESPACE;
5496                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5497                                                     "IPCNamespacePath= is not supported, refusing.");
5498                 } else
5499                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5500         }
5501
5502         if (needs_mount_namespace) {
5503                 _cleanup_free_ char *error_path = NULL;
5504
5505                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5506                 if (r < 0) {
5507                         *exit_status = EXIT_NAMESPACE;
5508                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5509                                                     error_path ? ": " : "", strempty(error_path));
5510                 }
5511         }
5512
5513         if (needs_sandboxing) {
5514                 r = apply_protect_hostname(unit, context, exit_status);
5515                 if (r < 0)
5516                         return r;
5517         }
5518
5519         if (context->memory_ksm >= 0)
5520                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5521                         if (ERRNO_IS_NOT_SUPPORTED(errno))
5522                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5523                         else {
5524                                 *exit_status = EXIT_KSM;
5525                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5526                         }
5527                 }
5528
5529         /* Drop groups as early as possible.
5530          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5531          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5532         if (needs_setuid) {
5533                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5534                 int ngids_to_enforce = 0;
5535
5536                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5537                                                    ngids,
5538                                                    gids_after_pam,
5539                                                    ngids_after_pam,
5540                                                    &gids_to_enforce);
5541                 if (ngids_to_enforce < 0) {
5542                         *exit_status = EXIT_MEMORY;
5543                         return log_unit_error_errno(unit,
5544                                                     ngids_to_enforce,
5545                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
5546                 }
5547
5548                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5549                 if (r < 0) {
5550                         *exit_status = EXIT_GROUP;
5551                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5552                 }
5553         }
5554
5555         /* If the user namespace was not set up above, try to do it now.
5556          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5557          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5558          * case of mount namespaces being less privileged when the mount point list is copied from a
5559          * different user namespace). */
5560
5561         if (needs_sandboxing && context->private_users && !userns_set_up) {
5562                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5563                 if (r < 0) {
5564                         *exit_status = EXIT_USER;
5565                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5566                 }
5567         }
5568
5569         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5570          * shall execute. */
5571
5572         _cleanup_free_ char *executable = NULL;
5573         _cleanup_close_ int executable_fd = -EBADF;
5574         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5575         if (r < 0) {
5576                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5577                         log_unit_struct_errno(unit, LOG_INFO, r,
5578                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5579                                               LOG_UNIT_INVOCATION_ID(unit),
5580                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5581                                                                command->path),
5582                                               "EXECUTABLE=%s", command->path);
5583                         return 0;
5584                 }
5585
5586                 *exit_status = EXIT_EXEC;
5587
5588                 return log_unit_struct_errno(unit, LOG_INFO, r,
5589                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5590                                              LOG_UNIT_INVOCATION_ID(unit),
5591                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5592                                                               command->path),
5593                                              "EXECUTABLE=%s", command->path);
5594         }
5595
5596         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5597         if (r < 0) {
5598                 *exit_status = EXIT_FDS;
5599                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5600         }
5601
5602 #if HAVE_SELINUX
5603         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5604                 int fd = -EBADF;
5605
5606                 if (socket_fd >= 0)
5607                         fd = socket_fd;
5608                 else if (params->n_socket_fds == 1)
5609                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5610                          * use context from that fd to compute the label. */
5611                         fd = params->fds[0];
5612
5613                 if (fd >= 0) {
5614                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5615                         if (r < 0) {
5616                                 if (!context->selinux_context_ignore) {
5617                                         *exit_status = EXIT_SELINUX_CONTEXT;
5618                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5619                                 }
5620                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5621                         }
5622                 }
5623         }
5624 #endif
5625
5626         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5627          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5628          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5629          * execve(). */
5630
5631         r = close_all_fds(keep_fds, n_keep_fds);
5632         if (r >= 0)
5633                 r = shift_fds(fds, n_fds);
5634         if (r >= 0)
5635                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5636         if (r < 0) {
5637                 *exit_status = EXIT_FDS;
5638                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5639         }
5640
5641         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5642          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5643          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5644          * came this far. */
5645
5646         secure_bits = context->secure_bits;
5647
5648         if (needs_sandboxing) {
5649                 uint64_t bset;
5650
5651                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5652                  * (Note this is placed after the general resource limit initialization, see above, in order
5653                  * to take precedence.) */
5654                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5655                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5656                                 *exit_status = EXIT_LIMITS;
5657                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5658                         }
5659                 }
5660
5661 #if ENABLE_SMACK
5662                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5663                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
5664                 if (use_smack) {
5665                         r = setup_smack(unit->manager, context, executable_fd);
5666                         if (r < 0 && !context->smack_process_label_ignore) {
5667                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5668                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5669                         }
5670                 }
5671 #endif
5672
5673                 bset = context->capability_bounding_set;
5674                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5675                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5676                  * instead of us doing that */
5677                 if (needs_ambient_hack)
5678                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5679                                 (UINT64_C(1) << CAP_SETUID) |
5680                                 (UINT64_C(1) << CAP_SETGID);
5681
5682                 if (!cap_test_all(bset)) {
5683                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5684                         if (r < 0) {
5685                                 *exit_status = EXIT_CAPABILITIES;
5686                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5687                         }
5688                 }
5689
5690                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5691                  * keep-caps set.
5692                  *
5693                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5694                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5695                  * the ambient capabilities can be raised as they are present in the permitted and
5696                  * inheritable set. However it is possible that someone wants to set ambient capabilities
5697                  * without changing the user, so we also set the ambient capabilities here.
5698                  *
5699                  * The requested ambient capabilities are raised in the inheritable set if the second
5700                  * argument is true. */
5701                 if (!needs_ambient_hack) {
5702                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5703                         if (r < 0) {
5704                                 *exit_status = EXIT_CAPABILITIES;
5705                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5706                         }
5707                 }
5708         }
5709
5710         /* chroot to root directory first, before we lose the ability to chroot */
5711         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5712         if (r < 0)
5713                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5714
5715         if (needs_setuid) {
5716                 if (uid_is_valid(uid)) {
5717                         r = enforce_user(context, uid, capability_ambient_set);
5718                         if (r < 0) {
5719                                 *exit_status = EXIT_USER;
5720                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5721                         }
5722
5723                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5724
5725                                 /* Raise the ambient capabilities after user change. */
5726                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5727                                 if (r < 0) {
5728                                         *exit_status = EXIT_CAPABILITIES;
5729                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5730                                 }
5731                         }
5732                 }
5733         }
5734
5735         /* Apply working directory here, because the working directory might be on NFS and only the user running
5736          * this service might have the correct privilege to change to the working directory */
5737         r = apply_working_directory(context, params, runtime, home, exit_status);
5738         if (r < 0)
5739                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5740
5741         if (needs_sandboxing) {
5742                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5743                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5744                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5745                  * are restricted. */
5746
5747 #if HAVE_SELINUX
5748                 if (use_selinux) {
5749                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5750
5751                         if (exec_context) {
5752                                 r = setexeccon(exec_context);
5753                                 if (r < 0) {
5754                                         if (!context->selinux_context_ignore) {
5755                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5756                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5757                                         }
5758                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5759                                 }
5760                         }
5761                 }
5762 #endif
5763
5764 #if HAVE_APPARMOR
5765                 if (use_apparmor && context->apparmor_profile) {
5766                         r = aa_change_onexec(context->apparmor_profile);
5767                         if (r < 0 && !context->apparmor_profile_ignore) {
5768                                 *exit_status = EXIT_APPARMOR_PROFILE;
5769                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5770                         }
5771                 }
5772 #endif
5773
5774                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5775                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5776                  * requires CAP_SETPCAP. */
5777                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5778                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5779                          * effective set here.
5780                          *
5781                          * The effective set is overwritten during execve() with the following values:
5782                          *
5783                          * - ambient set (for non-root processes)
5784                          *
5785                          * - (inheritable | bounding) set for root processes)
5786                          *
5787                          * Hence there is no security impact to raise it in the effective set before execve
5788                          */
5789                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5790                         if (r < 0) {
5791                                 *exit_status = EXIT_CAPABILITIES;
5792                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5793                         }
5794                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5795                                 *exit_status = EXIT_SECUREBITS;
5796                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5797                         }
5798                 }
5799
5800                 if (context_has_no_new_privileges(context))
5801                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5802                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5803                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5804                         }
5805
5806 #if HAVE_SECCOMP
5807                 r = apply_address_families(unit, context);
5808                 if (r < 0) {
5809                         *exit_status = EXIT_ADDRESS_FAMILIES;
5810                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5811                 }
5812
5813                 r = apply_memory_deny_write_execute(unit, context);
5814                 if (r < 0) {
5815                         *exit_status = EXIT_SECCOMP;
5816                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5817                 }
5818
5819                 r = apply_restrict_realtime(unit, context);
5820                 if (r < 0) {
5821                         *exit_status = EXIT_SECCOMP;
5822                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5823                 }
5824
5825                 r = apply_restrict_suid_sgid(unit, context);
5826                 if (r < 0) {
5827                         *exit_status = EXIT_SECCOMP;
5828                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5829                 }
5830
5831                 r = apply_restrict_namespaces(unit, context);
5832                 if (r < 0) {
5833                         *exit_status = EXIT_SECCOMP;
5834                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5835                 }
5836
5837                 r = apply_protect_sysctl(unit, context);
5838                 if (r < 0) {
5839                         *exit_status = EXIT_SECCOMP;
5840                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5841                 }
5842
5843                 r = apply_protect_kernel_modules(unit, context);
5844                 if (r < 0) {
5845                         *exit_status = EXIT_SECCOMP;
5846                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5847                 }
5848
5849                 r = apply_protect_kernel_logs(unit, context);
5850                 if (r < 0) {
5851                         *exit_status = EXIT_SECCOMP;
5852                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5853                 }
5854
5855                 r = apply_protect_clock(unit, context);
5856                 if (r < 0) {
5857                         *exit_status = EXIT_SECCOMP;
5858                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5859                 }
5860
5861                 r = apply_private_devices(unit, context);
5862                 if (r < 0) {
5863                         *exit_status = EXIT_SECCOMP;
5864                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5865                 }
5866
5867                 r = apply_syscall_archs(unit, context);
5868                 if (r < 0) {
5869                         *exit_status = EXIT_SECCOMP;
5870                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5871                 }
5872
5873                 r = apply_lock_personality(unit, context);
5874                 if (r < 0) {
5875                         *exit_status = EXIT_SECCOMP;
5876                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5877                 }
5878
5879                 r = apply_syscall_log(unit, context);
5880                 if (r < 0) {
5881                         *exit_status = EXIT_SECCOMP;
5882                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5883                 }
5884
5885                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5886                  * by the filter as little as possible. */
5887                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5888                 if (r < 0) {
5889                         *exit_status = EXIT_SECCOMP;
5890                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5891                 }
5892 #endif
5893
5894 #if HAVE_LIBBPF
5895                 r = apply_restrict_filesystems(unit, context);
5896                 if (r < 0) {
5897                         *exit_status = EXIT_BPF;
5898                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5899                 }
5900 #endif
5901
5902         }
5903
5904         if (!strv_isempty(context->unset_environment)) {
5905                 char **ee = NULL;
5906
5907                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5908                 if (!ee) {
5909                         *exit_status = EXIT_MEMORY;
5910                         return log_oom();
5911                 }
5912
5913                 strv_free_and_replace(accum_env, ee);
5914         }
5915
5916         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5917                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5918
5919                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5920                 if (r < 0) {
5921                         *exit_status = EXIT_MEMORY;
5922                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5923                 }
5924                 final_argv = replaced_argv;
5925
5926                 if (!strv_isempty(unset_variables)) {
5927                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5928                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5929                 }
5930
5931                 if (!strv_isempty(bad_variables)) {
5932                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5933                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5934                 }
5935         } else
5936                 final_argv = command->argv;
5937
5938         log_command_line(unit, "Executing", executable, final_argv);
5939
5940         if (exec_fd >= 0) {
5941                 uint8_t hot = 1;
5942
5943                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5944                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5945
5946                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5947                         *exit_status = EXIT_EXEC;
5948                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5949                 }
5950         }
5951
5952         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5953
5954         if (exec_fd >= 0) {
5955                 uint8_t hot = 0;
5956
5957                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5958                  * that POLLHUP on it no longer means execve() succeeded. */
5959
5960                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5961                         *exit_status = EXIT_EXEC;
5962                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5963                 }
5964         }
5965
5966         *exit_status = EXIT_EXEC;
5967         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5968 }
5969
5970 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5971 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5972
5973 int exec_spawn(Unit *unit,
5974                ExecCommand *command,
5975                const ExecContext *context,
5976                const ExecParameters *params,
5977                ExecRuntime *runtime,
5978                const CGroupContext *cgroup_context,
5979                pid_t *ret) {
5980
5981         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5982         _cleanup_free_ char *subcgroup_path = NULL;
5983         _cleanup_strv_free_ char **files_env = NULL;
5984         size_t n_storage_fds = 0, n_socket_fds = 0;
5985         pid_t pid;
5986
5987         assert(unit);
5988         assert(command);
5989         assert(context);
5990         assert(ret);
5991         assert(params);
5992         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5993
5994         LOG_CONTEXT_PUSH_UNIT(unit);
5995
5996         if (context->std_input == EXEC_INPUT_SOCKET ||
5997             context->std_output == EXEC_OUTPUT_SOCKET ||
5998             context->std_error == EXEC_OUTPUT_SOCKET) {
5999
6000                 if (params->n_socket_fds > 1)
6001                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
6002
6003                 if (params->n_socket_fds == 0)
6004                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
6005
6006                 socket_fd = params->fds[0];
6007         } else {
6008                 socket_fd = -EBADF;
6009                 fds = params->fds;
6010                 n_socket_fds = params->n_socket_fds;
6011                 n_storage_fds = params->n_storage_fds;
6012         }
6013
6014         r = exec_context_named_iofds(context, params, named_iofds);
6015         if (r < 0)
6016                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6017
6018         r = exec_context_load_environment(unit, context, &files_env);
6019         if (r < 0)
6020                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6021
6022         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6023            and, until the next SELinux policy changes, we save further reloads in future children. */
6024         mac_selinux_maybe_reload();
6025
6026         /* We won't know the real executable path until we create the mount namespace in the child, but we
6027            want to log from the parent, so we use the possibly inaccurate path here. */
6028         log_command_line(unit, "About to execute", command->path, command->argv);
6029
6030         if (params->cgroup_path) {
6031                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6032                 if (r < 0)
6033                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6034                 if (r > 0) {
6035                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6036                          * realized by the unit logic) */
6037
6038                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6039                         if (r < 0)
6040                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6041                 }
6042         }
6043
6044         pid = fork();
6045         if (pid < 0)
6046                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6047
6048         if (pid == 0) {
6049                 int exit_status = EXIT_SUCCESS;
6050
6051                 r = exec_child(unit,
6052                                command,
6053                                context,
6054                                params,
6055                                runtime,
6056                                cgroup_context,
6057                                socket_fd,
6058                                named_iofds,
6059                                fds,
6060                                n_socket_fds,
6061                                n_storage_fds,
6062                                files_env,
6063                                unit->manager->user_lookup_fds[1],
6064                                &exit_status);
6065
6066                 if (r < 0) {
6067                         const char *status =
6068                                 exit_status_to_string(exit_status,
6069                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6070
6071                         log_unit_struct_errno(unit, LOG_ERR, r,
6072                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6073                                               LOG_UNIT_INVOCATION_ID(unit),
6074                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6075                                                                status, command->path),
6076                                               "EXECUTABLE=%s", command->path);
6077                 }
6078
6079                 _exit(exit_status);
6080         }
6081
6082         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6083
6084         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6085          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6086          * process will be killed too). */
6087         if (subcgroup_path)
6088                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6089
6090         exec_status_start(&command->exec_status, pid);
6091
6092         *ret = pid;
6093         return 0;
6094 }
6095
6096 void exec_context_init(ExecContext *c) {
6097         assert(c);
6098
6099         c->umask = 0022;
6100         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6101         c->cpu_sched_policy = SCHED_OTHER;
6102         c->syslog_priority = LOG_DAEMON|LOG_INFO;
6103         c->syslog_level_prefix = true;
6104         c->ignore_sigpipe = true;
6105         c->timer_slack_nsec = NSEC_INFINITY;
6106         c->personality = PERSONALITY_INVALID;
6107         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6108                 c->directories[t].mode = 0755;
6109         c->timeout_clean_usec = USEC_INFINITY;
6110         c->capability_bounding_set = CAP_MASK_UNSET;
6111         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6112         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6113         c->log_level_max = -1;
6114 #if HAVE_SECCOMP
6115         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6116 #endif
6117         c->tty_rows = UINT_MAX;
6118         c->tty_cols = UINT_MAX;
6119         numa_policy_reset(&c->numa_policy);
6120         c->private_mounts = -1;
6121         c->memory_ksm = -1;
6122 }
6123
6124 void exec_context_done(ExecContext *c) {
6125         assert(c);
6126
6127         c->environment = strv_free(c->environment);
6128         c->environment_files = strv_free(c->environment_files);
6129         c->pass_environment = strv_free(c->pass_environment);
6130         c->unset_environment = strv_free(c->unset_environment);
6131
6132         rlimit_free_all(c->rlimit);
6133
6134         for (size_t l = 0; l < 3; l++) {
6135                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6136                 c->stdio_file[l] = mfree(c->stdio_file[l]);
6137         }
6138
6139         c->working_directory = mfree(c->working_directory);
6140         c->root_directory = mfree(c->root_directory);
6141         c->root_image = mfree(c->root_image);
6142         c->root_image_options = mount_options_free_all(c->root_image_options);
6143         c->root_hash = mfree(c->root_hash);
6144         c->root_hash_size = 0;
6145         c->root_hash_path = mfree(c->root_hash_path);
6146         c->root_hash_sig = mfree(c->root_hash_sig);
6147         c->root_hash_sig_size = 0;
6148         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6149         c->root_verity = mfree(c->root_verity);
6150         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6151         c->extension_directories = strv_free(c->extension_directories);
6152         c->tty_path = mfree(c->tty_path);
6153         c->syslog_identifier = mfree(c->syslog_identifier);
6154         c->user = mfree(c->user);
6155         c->group = mfree(c->group);
6156
6157         c->supplementary_groups = strv_free(c->supplementary_groups);
6158
6159         c->pam_name = mfree(c->pam_name);
6160
6161         c->read_only_paths = strv_free(c->read_only_paths);
6162         c->read_write_paths = strv_free(c->read_write_paths);
6163         c->inaccessible_paths = strv_free(c->inaccessible_paths);
6164         c->exec_paths = strv_free(c->exec_paths);
6165         c->no_exec_paths = strv_free(c->no_exec_paths);
6166         c->exec_search_path = strv_free(c->exec_search_path);
6167
6168         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6169         c->bind_mounts = NULL;
6170         c->n_bind_mounts = 0;
6171         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6172         c->temporary_filesystems = NULL;
6173         c->n_temporary_filesystems = 0;
6174         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6175
6176         cpu_set_reset(&c->cpu_set);
6177         numa_policy_reset(&c->numa_policy);
6178
6179         c->utmp_id = mfree(c->utmp_id);
6180         c->selinux_context = mfree(c->selinux_context);
6181         c->apparmor_profile = mfree(c->apparmor_profile);
6182         c->smack_process_label = mfree(c->smack_process_label);
6183
6184         c->restrict_filesystems = set_free(c->restrict_filesystems);
6185
6186         c->syscall_filter = hashmap_free(c->syscall_filter);
6187         c->syscall_archs = set_free(c->syscall_archs);
6188         c->address_families = set_free(c->address_families);
6189
6190         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6191                 exec_directory_done(&c->directories[t]);
6192
6193         c->log_level_max = -1;
6194
6195         exec_context_free_log_extra_fields(c);
6196         c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
6197         c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
6198
6199         c->log_ratelimit_interval_usec = 0;
6200         c->log_ratelimit_burst = 0;
6201
6202         c->stdin_data = mfree(c->stdin_data);
6203         c->stdin_data_size = 0;
6204
6205         c->network_namespace_path = mfree(c->network_namespace_path);
6206         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6207
6208         c->log_namespace = mfree(c->log_namespace);
6209
6210         c->load_credentials = hashmap_free(c->load_credentials);
6211         c->set_credentials = hashmap_free(c->set_credentials);
6212         c->import_credentials = set_free(c->import_credentials);
6213
6214         c->root_image_policy = image_policy_free(c->root_image_policy);
6215         c->mount_image_policy = image_policy_free(c->mount_image_policy);
6216         c->extension_image_policy = image_policy_free(c->extension_image_policy);
6217 }
6218
6219 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6220         assert(c);
6221
6222         if (!runtime_prefix)
6223                 return 0;
6224
6225         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6226                 _cleanup_free_ char *p = NULL;
6227
6228                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6229                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6230                 else
6231                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6232                 if (!p)
6233                         return -ENOMEM;
6234
6235                 /* We execute this synchronously, since we need to be sure this is gone when we start the
6236                  * service next. */
6237                 (void) rm_rf(p, REMOVE_ROOT);
6238
6239                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6240                         _cleanup_free_ char *symlink_abs = NULL;
6241
6242                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6243                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6244                         else
6245                                 symlink_abs = path_join(runtime_prefix, *symlink);
6246                         if (!symlink_abs)
6247                                 return -ENOMEM;
6248
6249                         (void) unlink(symlink_abs);
6250                 }
6251         }
6252
6253         return 0;
6254 }
6255
6256 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6257         _cleanup_free_ char *p = NULL;
6258
6259         assert(c);
6260
6261         if (!runtime_prefix || !unit)
6262                 return 0;
6263
6264         p = path_join(runtime_prefix, "credentials", unit);
6265         if (!p)
6266                 return -ENOMEM;
6267
6268         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6269          * unmount it, and afterwards remove the mount point */
6270         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6271         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6272
6273         return 0;
6274 }
6275
6276 int exec_context_destroy_mount_ns_dir(Unit *u) {
6277         _cleanup_free_ char *p = NULL;
6278
6279         if (!u || !MANAGER_IS_SYSTEM(u->manager))
6280                 return 0;
6281
6282         p = path_join("/run/systemd/propagate/", u->id);
6283         if (!p)
6284                 return -ENOMEM;
6285
6286         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6287         if (rmdir(p) < 0 && errno != ENOENT)
6288                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6289
6290         return 0;
6291 }
6292
6293 static void exec_command_done(ExecCommand *c) {
6294         assert(c);
6295
6296         c->path = mfree(c->path);
6297         c->argv = strv_free(c->argv);
6298 }
6299
6300 void exec_command_done_array(ExecCommand *c, size_t n) {
6301         for (size_t i = 0; i < n; i++)
6302                 exec_command_done(c+i);
6303 }
6304
6305 ExecCommand* exec_command_free_list(ExecCommand *c) {
6306         ExecCommand *i;
6307
6308         while ((i = c)) {
6309                 LIST_REMOVE(command, c, i);
6310                 exec_command_done(i);
6311                 free(i);
6312         }
6313
6314         return NULL;
6315 }
6316
6317 void exec_command_free_array(ExecCommand **c, size_t n) {
6318         for (size_t i = 0; i < n; i++)
6319                 c[i] = exec_command_free_list(c[i]);
6320 }
6321
6322 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6323         for (size_t i = 0; i < n; i++)
6324                 exec_status_reset(&c[i].exec_status);
6325 }
6326
6327 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6328         for (size_t i = 0; i < n; i++)
6329                 LIST_FOREACH(command, z, c[i])
6330                         exec_status_reset(&z->exec_status);
6331 }
6332
6333 typedef struct InvalidEnvInfo {
6334         const Unit *unit;
6335         const char *path;
6336 } InvalidEnvInfo;
6337
6338 static void invalid_env(const char *p, void *userdata) {
6339         InvalidEnvInfo *info = userdata;
6340
6341         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6342 }
6343
6344 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6345         assert(c);
6346
6347         switch (fd_index) {
6348
6349         case STDIN_FILENO:
6350                 if (c->std_input != EXEC_INPUT_NAMED_FD)
6351                         return NULL;
6352
6353                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6354
6355         case STDOUT_FILENO:
6356                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6357                         return NULL;
6358
6359                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6360
6361         case STDERR_FILENO:
6362                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6363                         return NULL;
6364
6365                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6366
6367         default:
6368                 return NULL;
6369         }
6370 }
6371
6372 static int exec_context_named_iofds(
6373                 const ExecContext *c,
6374                 const ExecParameters *p,
6375                 int named_iofds[static 3]) {
6376
6377         size_t targets;
6378         const char* stdio_fdname[3];
6379         size_t n_fds;
6380
6381         assert(c);
6382         assert(p);
6383         assert(named_iofds);
6384
6385         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6386                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6387                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
6388
6389         for (size_t i = 0; i < 3; i++)
6390                 stdio_fdname[i] = exec_context_fdname(c, i);
6391
6392         n_fds = p->n_storage_fds + p->n_socket_fds;
6393
6394         for (size_t i = 0; i < n_fds  && targets > 0; i++)
6395                 if (named_iofds[STDIN_FILENO] < 0 &&
6396                     c->std_input == EXEC_INPUT_NAMED_FD &&
6397                     stdio_fdname[STDIN_FILENO] &&
6398                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6399
6400                         named_iofds[STDIN_FILENO] = p->fds[i];
6401                         targets--;
6402
6403                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6404                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
6405                            stdio_fdname[STDOUT_FILENO] &&
6406                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6407
6408                         named_iofds[STDOUT_FILENO] = p->fds[i];
6409                         targets--;
6410
6411                 } else if (named_iofds[STDERR_FILENO] < 0 &&
6412                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
6413                            stdio_fdname[STDERR_FILENO] &&
6414                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6415
6416                         named_iofds[STDERR_FILENO] = p->fds[i];
6417                         targets--;
6418                 }
6419
6420         return targets == 0 ? 0 : -ENOENT;
6421 }
6422
6423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6424         _cleanup_strv_free_ char **v = NULL;
6425         int r;
6426
6427         assert(c);
6428         assert(ret);
6429
6430         STRV_FOREACH(i, c->environment_files) {
6431                 _cleanup_globfree_ glob_t pglob = {};
6432                 bool ignore = false;
6433                 char *fn = *i;
6434
6435                 if (fn[0] == '-') {
6436                         ignore = true;
6437                         fn++;
6438                 }
6439
6440                 if (!path_is_absolute(fn)) {
6441                         if (ignore)
6442                                 continue;
6443                         return -EINVAL;
6444                 }
6445
6446                 /* Filename supports globbing, take all matching files */
6447                 r = safe_glob(fn, 0, &pglob);
6448                 if (r < 0) {
6449                         if (ignore)
6450                                 continue;
6451                         return r;
6452                 }
6453
6454                 /* When we don't match anything, -ENOENT should be returned */
6455                 assert(pglob.gl_pathc > 0);
6456
6457                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6458                         _cleanup_strv_free_ char **p = NULL;
6459
6460                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6461                         if (r < 0) {
6462                                 if (ignore)
6463                                         continue;
6464                                 return r;
6465                         }
6466
6467                         /* Log invalid environment variables with filename */
6468                         if (p) {
6469                                 InvalidEnvInfo info = {
6470                                         .unit = unit,
6471                                         .path = pglob.gl_pathv[n]
6472                                 };
6473
6474                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
6475                         }
6476
6477                         if (!v)
6478                                 v = TAKE_PTR(p);
6479                         else {
6480                                 char **m = strv_env_merge(v, p);
6481                                 if (!m)
6482                                         return -ENOMEM;
6483
6484                                 strv_free_and_replace(v, m);
6485                         }
6486                 }
6487         }
6488
6489         *ret = TAKE_PTR(v);
6490
6491         return 0;
6492 }
6493
6494 static bool tty_may_match_dev_console(const char *tty) {
6495         _cleanup_free_ char *resolved = NULL;
6496
6497         if (!tty)
6498                 return true;
6499
6500         tty = skip_dev_prefix(tty);
6501
6502         /* trivial identity? */
6503         if (streq(tty, "console"))
6504                 return true;
6505
6506         if (resolve_dev_console(&resolved) < 0)
6507                 return true; /* if we could not resolve, assume it may */
6508
6509         /* "tty0" means the active VC, so it may be the same sometimes */
6510         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6511 }
6512
6513 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6514         assert(ec);
6515
6516         return ec->tty_reset ||
6517                 ec->tty_vhangup ||
6518                 ec->tty_vt_disallocate ||
6519                 is_terminal_input(ec->std_input) ||
6520                 is_terminal_output(ec->std_output) ||
6521                 is_terminal_output(ec->std_error);
6522 }
6523
6524 bool exec_context_may_touch_console(const ExecContext *ec) {
6525
6526         return exec_context_may_touch_tty(ec) &&
6527                tty_may_match_dev_console(exec_context_tty_path(ec));
6528 }
6529
6530 static void strv_fprintf(FILE *f, char **l) {
6531         assert(f);
6532
6533         STRV_FOREACH(g, l)
6534                 fprintf(f, " %s", *g);
6535 }
6536
/* Emits one "<prefix><name>: item item ..." line to 'f' for the given string vector. Nothing at all is
 * written if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6548
6549 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6550         int r;
6551
6552         assert(c);
6553         assert(f);
6554
6555         prefix = strempty(prefix);
6556
6557         fprintf(f,
6558                 "%sUMask: %04o\n"
6559                 "%sWorkingDirectory: %s\n"
6560                 "%sRootDirectory: %s\n"
6561                 "%sRootEphemeral: %s\n"
6562                 "%sNonBlocking: %s\n"
6563                 "%sPrivateTmp: %s\n"
6564                 "%sPrivateDevices: %s\n"
6565                 "%sProtectKernelTunables: %s\n"
6566                 "%sProtectKernelModules: %s\n"
6567                 "%sProtectKernelLogs: %s\n"
6568                 "%sProtectClock: %s\n"
6569                 "%sProtectControlGroups: %s\n"
6570                 "%sPrivateNetwork: %s\n"
6571                 "%sPrivateUsers: %s\n"
6572                 "%sProtectHome: %s\n"
6573                 "%sProtectSystem: %s\n"
6574                 "%sMountAPIVFS: %s\n"
6575                 "%sIgnoreSIGPIPE: %s\n"
6576                 "%sMemoryDenyWriteExecute: %s\n"
6577                 "%sRestrictRealtime: %s\n"
6578                 "%sRestrictSUIDSGID: %s\n"
6579                 "%sKeyringMode: %s\n"
6580                 "%sProtectHostname: %s\n"
6581                 "%sProtectProc: %s\n"
6582                 "%sProcSubset: %s\n",
6583                 prefix, c->umask,
6584                 prefix, empty_to_root(c->working_directory),
6585                 prefix, empty_to_root(c->root_directory),
6586                 prefix, yes_no(c->root_ephemeral),
6587                 prefix, yes_no(c->non_blocking),
6588                 prefix, yes_no(c->private_tmp),
6589                 prefix, yes_no(c->private_devices),
6590                 prefix, yes_no(c->protect_kernel_tunables),
6591                 prefix, yes_no(c->protect_kernel_modules),
6592                 prefix, yes_no(c->protect_kernel_logs),
6593                 prefix, yes_no(c->protect_clock),
6594                 prefix, yes_no(c->protect_control_groups),
6595                 prefix, yes_no(c->private_network),
6596                 prefix, yes_no(c->private_users),
6597                 prefix, protect_home_to_string(c->protect_home),
6598                 prefix, protect_system_to_string(c->protect_system),
6599                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6600                 prefix, yes_no(c->ignore_sigpipe),
6601                 prefix, yes_no(c->memory_deny_write_execute),
6602                 prefix, yes_no(c->restrict_realtime),
6603                 prefix, yes_no(c->restrict_suid_sgid),
6604                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6605                 prefix, yes_no(c->protect_hostname),
6606                 prefix, protect_proc_to_string(c->protect_proc),
6607                 prefix, proc_subset_to_string(c->proc_subset));
6608
6609         if (c->root_image)
6610                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6611
6612         if (c->root_image_options) {
6613                 fprintf(f, "%sRootImageOptions:", prefix);
6614                 LIST_FOREACH(mount_options, o, c->root_image_options)
6615                         if (!isempty(o->options))
6616                                 fprintf(f, " %s:%s",
6617                                         partition_designator_to_string(o->partition_designator),
6618                                         o->options);
6619                 fprintf(f, "\n");
6620         }
6621
6622         if (c->root_hash) {
6623                 _cleanup_free_ char *encoded = NULL;
6624                 encoded = hexmem(c->root_hash, c->root_hash_size);
6625                 if (encoded)
6626                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6627         }
6628
6629         if (c->root_hash_path)
6630                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6631
6632         if (c->root_hash_sig) {
6633                 _cleanup_free_ char *encoded = NULL;
6634                 ssize_t len;
6635                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6636                 if (len)
6637                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6638         }
6639
6640         if (c->root_hash_sig_path)
6641                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6642
6643         if (c->root_verity)
6644                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6645
6646         STRV_FOREACH(e, c->environment)
6647                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6648
6649         STRV_FOREACH(e, c->environment_files)
6650                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6651
6652         STRV_FOREACH(e, c->pass_environment)
6653                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6654
6655         STRV_FOREACH(e, c->unset_environment)
6656                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6657
6658         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6659
6660         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6661                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6662
6663                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6664                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6665
6666                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6667                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6668                 }
6669         }
6670
6671         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6672
6673         if (c->nice_set)
6674                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6675
6676         if (c->oom_score_adjust_set)
6677                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6678
6679         if (c->coredump_filter_set)
6680                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6681
6682         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6683                 if (c->rlimit[i]) {
6684                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6685                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6686                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6687                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6688                 }
6689
6690         if (c->ioprio_set) {
6691                 _cleanup_free_ char *class_str = NULL;
6692
6693                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6694                 if (r >= 0)
6695                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6696
6697                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6698         }
6699
6700         if (c->cpu_sched_set) {
6701                 _cleanup_free_ char *policy_str = NULL;
6702
6703                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6704                 if (r >= 0)
6705                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6706
6707                 fprintf(f,
6708                         "%sCPUSchedulingPriority: %i\n"
6709                         "%sCPUSchedulingResetOnFork: %s\n",
6710                         prefix, c->cpu_sched_priority,
6711                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6712         }
6713
6714         if (c->cpu_set.set) {
6715                 _cleanup_free_ char *affinity = NULL;
6716
6717                 affinity = cpu_set_to_range_string(&c->cpu_set);
6718                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6719         }
6720
6721         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6722                 _cleanup_free_ char *nodes = NULL;
6723
6724                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6725                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6726                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6727         }
6728
6729         if (c->timer_slack_nsec != NSEC_INFINITY)
6730                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6731
6732         fprintf(f,
6733                 "%sStandardInput: %s\n"
6734                 "%sStandardOutput: %s\n"
6735                 "%sStandardError: %s\n",
6736                 prefix, exec_input_to_string(c->std_input),
6737                 prefix, exec_output_to_string(c->std_output),
6738                 prefix, exec_output_to_string(c->std_error));
6739
6740         if (c->std_input == EXEC_INPUT_NAMED_FD)
6741                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6742         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6743                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6744         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6745                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6746
6747         if (c->std_input == EXEC_INPUT_FILE)
6748                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6749         if (c->std_output == EXEC_OUTPUT_FILE)
6750                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6751         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6752                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6753         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6754                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6755         if (c->std_error == EXEC_OUTPUT_FILE)
6756                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6757         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6758                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6759         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6760                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6761
6762         if (c->tty_path)
6763                 fprintf(f,
6764                         "%sTTYPath: %s\n"
6765                         "%sTTYReset: %s\n"
6766                         "%sTTYVHangup: %s\n"
6767                         "%sTTYVTDisallocate: %s\n"
6768                         "%sTTYRows: %u\n"
6769                         "%sTTYColumns: %u\n",
6770                         prefix, c->tty_path,
6771                         prefix, yes_no(c->tty_reset),
6772                         prefix, yes_no(c->tty_vhangup),
6773                         prefix, yes_no(c->tty_vt_disallocate),
6774                         prefix, c->tty_rows,
6775                         prefix, c->tty_cols);
6776
6777         if (IN_SET(c->std_output,
6778                    EXEC_OUTPUT_KMSG,
6779                    EXEC_OUTPUT_JOURNAL,
6780                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6781                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6782             IN_SET(c->std_error,
6783                    EXEC_OUTPUT_KMSG,
6784                    EXEC_OUTPUT_JOURNAL,
6785                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6786                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6787
6788                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6789
6790                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6791                 if (r >= 0)
6792                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6793
6794                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6795                 if (r >= 0)
6796                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6797         }
6798
6799         if (c->log_level_max >= 0) {
6800                 _cleanup_free_ char *t = NULL;
6801
6802                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6803
6804                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6805         }
6806
6807         if (c->log_ratelimit_interval_usec > 0)
6808                 fprintf(f,
6809                         "%sLogRateLimitIntervalSec: %s\n",
6810                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6811
6812         if (c->log_ratelimit_burst > 0)
6813                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6814
6815         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6816                 fprintf(f, "%sLogFilterPatterns:", prefix);
6817
6818                 char *pattern;
6819                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6820                         fprintf(f, " %s", pattern);
6821                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6822                         fprintf(f, " ~%s", pattern);
6823                 fputc('\n', f);
6824         }
6825
6826         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6827                 fprintf(f, "%sLogExtraFields: ", prefix);
6828                 fwrite(c->log_extra_fields[j].iov_base,
6829                        1, c->log_extra_fields[j].iov_len,
6830                        f);
6831                 fputc('\n', f);
6832         }
6833
6834         if (c->log_namespace)
6835                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6836
6837         if (c->secure_bits) {
6838                 _cleanup_free_ char *str = NULL;
6839
6840                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6841                 if (r >= 0)
6842                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6843         }
6844
6845         if (c->capability_bounding_set != CAP_MASK_UNSET) {
6846                 _cleanup_free_ char *str = NULL;
6847
6848                 r = capability_set_to_string(c->capability_bounding_set, &str);
6849                 if (r >= 0)
6850                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6851         }
6852
6853         if (c->capability_ambient_set != 0) {
6854                 _cleanup_free_ char *str = NULL;
6855
6856                 r = capability_set_to_string(c->capability_ambient_set, &str);
6857                 if (r >= 0)
6858                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6859         }
6860
6861         if (c->user)
6862                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6863         if (c->group)
6864                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6865
6866         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6867
6868         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6869
6870         if (c->pam_name)
6871                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6872
6873         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6874         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6875         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6876         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6877         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6878         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6879
6880         for (size_t i = 0; i < c->n_bind_mounts; i++)
6881                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6882                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6883                         c->bind_mounts[i].ignore_enoent ? "-": "",
6884                         c->bind_mounts[i].source,
6885                         c->bind_mounts[i].destination,
6886                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6887
6888         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6889                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6890
6891                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6892                         t->path,
6893                         isempty(t->options) ? "" : ":",
6894                         strempty(t->options));
6895         }
6896
6897         if (c->utmp_id)
6898                 fprintf(f,
6899                         "%sUtmpIdentifier: %s\n",
6900                         prefix, c->utmp_id);
6901
6902         if (c->selinux_context)
6903                 fprintf(f,
6904                         "%sSELinuxContext: %s%s\n",
6905                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6906
6907         if (c->apparmor_profile)
6908                 fprintf(f,
6909                         "%sAppArmorProfile: %s%s\n",
6910                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6911
6912         if (c->smack_process_label)
6913                 fprintf(f,
6914                         "%sSmackProcessLabel: %s%s\n",
6915                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6916
6917         if (c->personality != PERSONALITY_INVALID)
6918                 fprintf(f,
6919                         "%sPersonality: %s\n",
6920                         prefix, strna(personality_to_string(c->personality)));
6921
6922         fprintf(f,
6923                 "%sLockPersonality: %s\n",
6924                 prefix, yes_no(c->lock_personality));
6925
6926         if (c->syscall_filter) {
6927                 fprintf(f,
6928                         "%sSystemCallFilter: ",
6929                         prefix);
6930
6931                 if (!c->syscall_allow_list)
6932                         fputc('~', f);
6933
6934 #if HAVE_SECCOMP
6935                 void *id, *val;
6936                 bool first = true;
6937                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6938                         _cleanup_free_ char *name = NULL;
6939                         const char *errno_name = NULL;
6940                         int num = PTR_TO_INT(val);
6941
6942                         if (first)
6943                                 first = false;
6944                         else
6945                                 fputc(' ', f);
6946
6947                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6948                         fputs(strna(name), f);
6949
6950                         if (num >= 0) {
6951                                 errno_name = seccomp_errno_or_action_to_string(num);
6952                                 if (errno_name)
6953                                         fprintf(f, ":%s", errno_name);
6954                                 else
6955                                         fprintf(f, ":%d", num);
6956                         }
6957                 }
6958 #endif
6959
6960                 fputc('\n', f);
6961         }
6962
6963         if (c->syscall_archs) {
6964                 fprintf(f,
6965                         "%sSystemCallArchitectures:",
6966                         prefix);
6967
6968 #if HAVE_SECCOMP
6969                 void *id;
6970                 SET_FOREACH(id, c->syscall_archs)
6971                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6972 #endif
6973                 fputc('\n', f);
6974         }
6975
6976         if (exec_context_restrict_namespaces_set(c)) {
6977                 _cleanup_free_ char *s = NULL;
6978
6979                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6980                 if (r >= 0)
6981                         fprintf(f, "%sRestrictNamespaces: %s\n",
6982                                 prefix, strna(s));
6983         }
6984
6985 #if HAVE_LIBBPF
6986         if (exec_context_restrict_filesystems_set(c)) {
6987                 char *fs;
6988                 SET_FOREACH(fs, c->restrict_filesystems)
6989                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6990         }
6991 #endif
6992
6993         if (c->network_namespace_path)
6994                 fprintf(f,
6995                         "%sNetworkNamespacePath: %s\n",
6996                         prefix, c->network_namespace_path);
6997
6998         if (c->syscall_errno > 0) {
6999                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
7000
7001 #if HAVE_SECCOMP
7002                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
7003                 if (errno_name)
7004                         fputs(errno_name, f);
7005                 else
7006                         fprintf(f, "%d", c->syscall_errno);
7007 #endif
7008                 fputc('\n', f);
7009         }
7010
7011         for (size_t i = 0; i < c->n_mount_images; i++) {
7012                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
7013                         c->mount_images[i].ignore_enoent ? "-": "",
7014                         c->mount_images[i].source,
7015                         c->mount_images[i].destination);
7016                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7017                         fprintf(f, ":%s:%s",
7018                                 partition_designator_to_string(o->partition_designator),
7019                                 strempty(o->options));
7020                 fprintf(f, "\n");
7021         }
7022
7023         for (size_t i = 0; i < c->n_extension_images; i++) {
7024                 fprintf(f, "%sExtensionImages: %s%s", prefix,
7025                         c->extension_images[i].ignore_enoent ? "-": "",
7026                         c->extension_images[i].source);
7027                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7028                         fprintf(f, ":%s:%s",
7029                                 partition_designator_to_string(o->partition_designator),
7030                                 strempty(o->options));
7031                 fprintf(f, "\n");
7032         }
7033
7034         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7035 }
7036
7037 bool exec_context_maintains_privileges(const ExecContext *c) {
7038         assert(c);
7039
7040         /* Returns true if the process forked off would run under
7041          * an unchanged UID or as root. */
7042
7043         if (!c->user)
7044                 return true;
7045
7046         if (streq(c->user, "root") || streq(c->user, "0"))
7047                 return true;
7048
7049         return false;
7050 }
7051
7052 int exec_context_get_effective_ioprio(const ExecContext *c) {
7053         int p;
7054
7055         assert(c);
7056
7057         if (c->ioprio_set)
7058                 return c->ioprio;
7059
7060         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7061         if (p < 0)
7062                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7063
7064         return ioprio_normalize(p);
7065 }
7066
7067 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7068         assert(c);
7069
7070         /* Explicit setting wins */
7071         if (c->mount_apivfs_set)
7072                 return c->mount_apivfs;
7073
7074         /* Default to "yes" if root directory or image are specified */
7075         if (exec_context_with_rootfs(c))
7076                 return true;
7077
7078         return false;
7079 }
7080
7081 void exec_context_free_log_extra_fields(ExecContext *c) {
7082         assert(c);
7083
7084         for (size_t l = 0; l < c->n_log_extra_fields; l++)
7085                 free(c->log_extra_fields[l].iov_base);
7086         c->log_extra_fields = mfree(c->log_extra_fields);
7087         c->n_log_extra_fields = 0;
7088 }
7089
7090 void exec_context_revert_tty(ExecContext *c) {
7091         _cleanup_close_ int fd = -EBADF;
7092         const char *path;
7093         struct stat st;
7094         int r;
7095
7096         assert(c);
7097
7098         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7099         exec_context_tty_reset(c, NULL);
7100
7101         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7102          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7103          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7104         if (!exec_context_may_touch_tty(c))
7105                 return;
7106
7107         path = exec_context_tty_path(c);
7108         if (!path)
7109                 return;
7110
7111         fd = open(path, O_PATH|O_CLOEXEC);
7112         if (fd < 0)
7113                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7114                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7115                                              path);
7116
7117         if (fstat(fd, &st) < 0)
7118                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7119
7120         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7121          * if things are a character device, since a proper check either means we'd have to open the TTY and
7122          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7123          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7124          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7125         if (!S_ISCHR(st.st_mode))
7126                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7127
7128         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7129         if (r < 0)
7130                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7131 }
7132
7133 int exec_context_get_clean_directories(
7134                 ExecContext *c,
7135                 char **prefix,
7136                 ExecCleanMask mask,
7137                 char ***ret) {
7138
7139         _cleanup_strv_free_ char **l = NULL;
7140         int r;
7141
7142         assert(c);
7143         assert(prefix);
7144         assert(ret);
7145
7146         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7147                 if (!FLAGS_SET(mask, 1U << t))
7148                         continue;
7149
7150                 if (!prefix[t])
7151                         continue;
7152
7153                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7154                         char *j;
7155
7156                         j = path_join(prefix[t], c->directories[t].items[i].path);
7157                         if (!j)
7158                                 return -ENOMEM;
7159
7160                         r = strv_consume(&l, j);
7161                         if (r < 0)
7162                                 return r;
7163
7164                         /* Also remove private directories unconditionally. */
7165                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
7166                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7167                                 if (!j)
7168                                         return -ENOMEM;
7169
7170                                 r = strv_consume(&l, j);
7171                                 if (r < 0)
7172                                         return r;
7173                         }
7174
7175                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7176                                 j = path_join(prefix[t], *symlink);
7177                                 if (!j)
7178                                         return -ENOMEM;
7179
7180                                 r = strv_consume(&l, j);
7181                                 if (r < 0)
7182                                         return r;
7183                         }
7184                 }
7185         }
7186
7187         *ret = TAKE_PTR(l);
7188         return 0;
7189 }
7190
7191 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7192         ExecCleanMask mask = 0;
7193
7194         assert(c);
7195         assert(ret);
7196
7197         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7198                 if (c->directories[t].n_items > 0)
7199                         mask |= 1U << t;
7200
7201         *ret = mask;
7202         return 0;
7203 }
7204
7205 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7206         ExecLoadCredential *load_cred;
7207         ExecSetCredential *set_cred;
7208
7209         assert(c);
7210
7211         HASHMAP_FOREACH(load_cred, c->load_credentials)
7212                 if (load_cred->encrypted)
7213                         return true;
7214
7215         HASHMAP_FOREACH(set_cred, c->set_credentials)
7216                 if (set_cred->encrypted)
7217                         return true;
7218
7219         return false;
7220 }
7221
7222 int exec_context_add_default_dependencies(Unit *u, const ExecContext *c) {
7223         assert(u);
7224         assert(u->default_dependencies);
7225
7226         if (c && exec_context_needs_term(c))
7227                 return unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_VCONSOLE_SETUP_SERVICE,
7228                                                    /* add_reference= */ true, UNIT_DEPENDENCY_DEFAULT);
7229         return 0;
7230 }
7231
7232 void exec_status_start(ExecStatus *s, pid_t pid) {
7233         assert(s);
7234
7235         *s = (ExecStatus) {
7236                 .pid = pid,
7237         };
7238
7239         dual_timestamp_get(&s->start_timestamp);
7240 }
7241
7242 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7243         assert(s);
7244
7245         if (s->pid != pid)
7246                 *s = (ExecStatus) {
7247                         .pid = pid,
7248                 };
7249
7250         dual_timestamp_get(&s->exit_timestamp);
7251
7252         s->code = code;
7253         s->status = status;
7254
7255         if (context && context->utmp_id)
7256                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7257 }
7258
7259 void exec_status_reset(ExecStatus *s) {
7260         assert(s);
7261
7262         *s = (ExecStatus) {};
7263 }
7264
7265 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7266         assert(s);
7267         assert(f);
7268
7269         if (s->pid <= 0)
7270                 return;
7271
7272         prefix = strempty(prefix);
7273
7274         fprintf(f,
7275                 "%sPID: "PID_FMT"\n",
7276                 prefix, s->pid);
7277
7278         if (dual_timestamp_is_set(&s->start_timestamp))
7279                 fprintf(f,
7280                         "%sStart Timestamp: %s\n",
7281                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7282
7283         if (dual_timestamp_is_set(&s->exit_timestamp))
7284                 fprintf(f,
7285                         "%sExit Timestamp: %s\n"
7286                         "%sExit Code: %s\n"
7287                         "%sExit Status: %i\n",
7288                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7289                         prefix, sigchld_code_to_string(s->code),
7290                         prefix, s->status);
7291 }
7292
7293 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7294         _cleanup_free_ char *cmd = NULL;
7295         const char *prefix2;
7296
7297         assert(c);
7298         assert(f);
7299
7300         prefix = strempty(prefix);
7301         prefix2 = strjoina(prefix, "\t");
7302
7303         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7304
7305         fprintf(f,
7306                 "%sCommand Line: %s\n",
7307                 prefix, strnull(cmd));
7308
7309         exec_status_dump(&c->exec_status, f, prefix2);
7310 }
7311
7312 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7313         assert(f);
7314
7315         prefix = strempty(prefix);
7316
7317         LIST_FOREACH(command, i, c)
7318                 exec_command_dump(i, f, prefix);
7319 }
7320
7321 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7322         ExecCommand *end;
7323
7324         assert(l);
7325         assert(e);
7326
7327         if (*l) {
7328                 /* It's kind of important, that we keep the order here */
7329                 end = LIST_FIND_TAIL(command, *l);
7330                 LIST_INSERT_AFTER(command, *l, end, e);
7331         } else
7332                 *l = e;
7333 }
7334
7335 int exec_command_set(ExecCommand *c, const char *path, ...) {
7336         va_list ap;
7337         char **l, *p;
7338
7339         assert(c);
7340         assert(path);
7341
7342         va_start(ap, path);
7343         l = strv_new_ap(path, ap);
7344         va_end(ap);
7345
7346         if (!l)
7347                 return -ENOMEM;
7348
7349         p = strdup(path);
7350         if (!p) {
7351                 strv_free(l);
7352                 return -ENOMEM;
7353         }
7354
7355         free_and_replace(c->path, p);
7356
7357         return strv_free_and_replace(c->argv, l);
7358 }
7359
7360 int exec_command_append(ExecCommand *c, const char *path, ...) {
7361         _cleanup_strv_free_ char **l = NULL;
7362         va_list ap;
7363         int r;
7364
7365         assert(c);
7366         assert(path);
7367
7368         va_start(ap, path);
7369         l = strv_new_ap(path, ap);
7370         va_end(ap);
7371
7372         if (!l)
7373                 return -ENOMEM;
7374
7375         r = strv_extend_strv(&c->argv, l, false);
7376         if (r < 0)
7377                 return r;
7378
7379         return 0;
7380 }
7381
7382 static char *destroy_tree(char *path) {
7383         if (!path)
7384                 return NULL;
7385
7386         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7387                 log_debug("Spawning process to nuke '%s'", path);
7388
7389                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7390         }
7391
7392         return mfree(path);
7393 }
7394
7395 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7396         if (!rt)
7397                 return NULL;
7398
7399         if (rt->manager)
7400                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7401
7402         rt->id = mfree(rt->id);
7403         rt->tmp_dir = mfree(rt->tmp_dir);
7404         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7405         safe_close_pair(rt->netns_storage_socket);
7406         safe_close_pair(rt->ipcns_storage_socket);
7407         return mfree(rt);
7408 }
7409
/* Instantiate the unref and cleanup helpers for ExecSharedRuntime on top of exec_shared_runtime_free().
 * exec_shared_runtime_unref() is called by exec_runtime_free(), and exec_shared_runtime_freep is used as
 * _cleanup_() handler in this file; presumably both are generated by these two macros — confirm against
 * the macro definitions. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7412
7413 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7414         if (!rt)
7415                 return NULL;
7416
7417         assert(rt->n_ref > 0);
7418         rt->n_ref--;
7419
7420         if (rt->n_ref > 0)
7421                 return NULL;
7422
7423         rt->tmp_dir = destroy_tree(rt->tmp_dir);
7424         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7425
7426         return exec_shared_runtime_free(rt);
7427 }
7428
7429 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7430         _cleanup_free_ char *id_copy = NULL;
7431         ExecSharedRuntime *n;
7432
7433         assert(ret);
7434
7435         id_copy = strdup(id);
7436         if (!id_copy)
7437                 return -ENOMEM;
7438
7439         n = new(ExecSharedRuntime, 1);
7440         if (!n)
7441                 return -ENOMEM;
7442
7443         *n = (ExecSharedRuntime) {
7444                 .id = TAKE_PTR(id_copy),
7445                 .netns_storage_socket = PIPE_EBADF,
7446                 .ipcns_storage_socket = PIPE_EBADF,
7447         };
7448
7449         *ret = n;
7450         return 0;
7451 }
7452
7453 static int exec_shared_runtime_add(
7454                 Manager *m,
7455                 const char *id,
7456                 char **tmp_dir,
7457                 char **var_tmp_dir,
7458                 int netns_storage_socket[2],
7459                 int ipcns_storage_socket[2],
7460                 ExecSharedRuntime **ret) {
7461
7462         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7463         int r;
7464
7465         assert(m);
7466         assert(id);
7467
7468         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7469
7470         r = exec_shared_runtime_allocate(&rt, id);
7471         if (r < 0)
7472                 return r;
7473
7474         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7475         if (r < 0)
7476                 return r;
7477
7478         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7479         rt->tmp_dir = TAKE_PTR(*tmp_dir);
7480         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7481
7482         if (netns_storage_socket) {
7483                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7484                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7485         }
7486
7487         if (ipcns_storage_socket) {
7488                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7489                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7490         }
7491
7492         rt->manager = m;
7493
7494         if (ret)
7495                 *ret = rt;
7496         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7497         TAKE_PTR(rt);
7498         return 0;
7499 }
7500
7501 static int exec_shared_runtime_make(
7502                 Manager *m,
7503                 const ExecContext *c,
7504                 const char *id,
7505                 ExecSharedRuntime **ret) {
7506
7507         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7508         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7509         int r;
7510
7511         assert(m);
7512         assert(c);
7513         assert(id);
7514
7515         /* It is not necessary to create ExecSharedRuntime object. */
7516         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7517                 *ret = NULL;
7518                 return 0;
7519         }
7520
7521         if (c->private_tmp &&
7522             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7523               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7524                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7525                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7526                 if (r < 0)
7527                         return r;
7528         }
7529
7530         if (exec_needs_network_namespace(c)) {
7531                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7532                         return -errno;
7533         }
7534
7535         if (exec_needs_ipc_namespace(c)) {
7536                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7537                         return -errno;
7538         }
7539
7540         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7541         if (r < 0)
7542                 return r;
7543
7544         return 1;
7545 }
7546
7547 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7548         ExecSharedRuntime *rt;
7549         int r;
7550
7551         assert(m);
7552         assert(id);
7553         assert(ret);
7554
7555         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7556         if (rt)
7557                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7558                 goto ref;
7559
7560         if (!create) {
7561                 *ret = NULL;
7562                 return 0;
7563         }
7564
7565         /* If not found, then create a new object. */
7566         r = exec_shared_runtime_make(m, c, id, &rt);
7567         if (r < 0)
7568                 return r;
7569         if (r == 0) {
7570                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7571                 *ret = NULL;
7572                 return 0;
7573         }
7574
7575 ref:
7576         /* increment reference counter. */
7577         rt->n_ref++;
7578         *ret = rt;
7579         return 1;
7580 }
7581
7582 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7583         ExecSharedRuntime *rt;
7584
7585         assert(m);
7586         assert(f);
7587         assert(fds);
7588
7589         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7590                 fprintf(f, "exec-runtime=%s", rt->id);
7591
7592                 if (rt->tmp_dir)
7593                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7594
7595                 if (rt->var_tmp_dir)
7596                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7597
7598                 if (rt->netns_storage_socket[0] >= 0) {
7599                         int copy;
7600
7601                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7602                         if (copy < 0)
7603                                 return copy;
7604
7605                         fprintf(f, " netns-socket-0=%i", copy);
7606                 }
7607
7608                 if (rt->netns_storage_socket[1] >= 0) {
7609                         int copy;
7610
7611                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7612                         if (copy < 0)
7613                                 return copy;
7614
7615                         fprintf(f, " netns-socket-1=%i", copy);
7616                 }
7617
7618                 if (rt->ipcns_storage_socket[0] >= 0) {
7619                         int copy;
7620
7621                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7622                         if (copy < 0)
7623                                 return copy;
7624
7625                         fprintf(f, " ipcns-socket-0=%i", copy);
7626                 }
7627
7628                 if (rt->ipcns_storage_socket[1] >= 0) {
7629                         int copy;
7630
7631                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7632                         if (copy < 0)
7633                                 return copy;
7634
7635                         fprintf(f, " ipcns-socket-1=%i", copy);
7636                 }
7637
7638                 fputc('\n', f);
7639         }
7640
7641         return 0;
7642 }
7643
7644 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7645         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7646         ExecSharedRuntime *rt;
7647         int r;
7648
7649         /* This is for the migration from old (v237 or earlier) deserialization text.
7650          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7651          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7652          * so or not from the serialized text, then we always creates a new object owned by this. */
7653
7654         assert(u);
7655         assert(key);
7656         assert(value);
7657
7658         /* Manager manages ExecSharedRuntime objects by the unit id.
7659          * So, we omit the serialized text when the unit does not have id (yet?)... */
7660         if (isempty(u->id)) {
7661                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7662                 return 0;
7663         }
7664
7665         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7666                 return log_oom();
7667
7668         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7669         if (!rt) {
7670                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7671                         return log_oom();
7672
7673                 rt = rt_create;
7674         }
7675
7676         if (streq(key, "tmp-dir")) {
7677                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7678                         return -ENOMEM;
7679
7680         } else if (streq(key, "var-tmp-dir")) {
7681                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7682                         return -ENOMEM;
7683
7684         } else if (streq(key, "netns-socket-0")) {
7685                 int fd;
7686
7687                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7688                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7689                         return 0;
7690                 }
7691
7692                 safe_close(rt->netns_storage_socket[0]);
7693                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7694
7695         } else if (streq(key, "netns-socket-1")) {
7696                 int fd;
7697
7698                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7699                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7700                         return 0;
7701                 }
7702
7703                 safe_close(rt->netns_storage_socket[1]);
7704                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7705
7706         } else
7707                 return 0;
7708
7709         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7710         if (rt_create) {
7711                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7712                 if (r < 0) {
7713                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7714                         return 0;
7715                 }
7716
7717                 rt_create->manager = u->manager;
7718
7719                 /* Avoid cleanup */
7720                 TAKE_PTR(rt_create);
7721         }
7722
7723         return 1;
7724 }
7725
7726 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7727         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7728         char *id = NULL;
7729         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7730         const char *p, *v = ASSERT_PTR(value);
7731         size_t n;
7732
7733         assert(m);
7734         assert(fds);
7735
7736         n = strcspn(v, " ");
7737         id = strndupa_safe(v, n);
7738         if (v[n] != ' ')
7739                 goto finalize;
7740         p = v + n + 1;
7741
7742         v = startswith(p, "tmp-dir=");
7743         if (v) {
7744                 n = strcspn(v, " ");
7745                 tmp_dir = strndup(v, n);
7746                 if (!tmp_dir)
7747                         return log_oom();
7748                 if (v[n] != ' ')
7749                         goto finalize;
7750                 p = v + n + 1;
7751         }
7752
7753         v = startswith(p, "var-tmp-dir=");
7754         if (v) {
7755                 n = strcspn(v, " ");
7756                 var_tmp_dir = strndup(v, n);
7757                 if (!var_tmp_dir)
7758                         return log_oom();
7759                 if (v[n] != ' ')
7760                         goto finalize;
7761                 p = v + n + 1;
7762         }
7763
7764         v = startswith(p, "netns-socket-0=");
7765         if (v) {
7766                 char *buf;
7767
7768                 n = strcspn(v, " ");
7769                 buf = strndupa_safe(v, n);
7770
7771                 netns_fdpair[0] = parse_fd(buf);
7772                 if (netns_fdpair[0] < 0)
7773                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7774                 if (!fdset_contains(fds, netns_fdpair[0]))
7775                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7776                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7777                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7778                 if (v[n] != ' ')
7779                         goto finalize;
7780                 p = v + n + 1;
7781         }
7782
7783         v = startswith(p, "netns-socket-1=");
7784         if (v) {
7785                 char *buf;
7786
7787                 n = strcspn(v, " ");
7788                 buf = strndupa_safe(v, n);
7789
7790                 netns_fdpair[1] = parse_fd(buf);
7791                 if (netns_fdpair[1] < 0)
7792                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7793                 if (!fdset_contains(fds, netns_fdpair[1]))
7794                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7795                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7796                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7797                 if (v[n] != ' ')
7798                         goto finalize;
7799                 p = v + n + 1;
7800         }
7801
7802         v = startswith(p, "ipcns-socket-0=");
7803         if (v) {
7804                 char *buf;
7805
7806                 n = strcspn(v, " ");
7807                 buf = strndupa_safe(v, n);
7808
7809                 ipcns_fdpair[0] = parse_fd(buf);
7810                 if (ipcns_fdpair[0] < 0)
7811                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7812                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7813                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7814                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7815                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7816                 if (v[n] != ' ')
7817                         goto finalize;
7818                 p = v + n + 1;
7819         }
7820
7821         v = startswith(p, "ipcns-socket-1=");
7822         if (v) {
7823                 char *buf;
7824
7825                 n = strcspn(v, " ");
7826                 buf = strndupa_safe(v, n);
7827
7828                 ipcns_fdpair[1] = parse_fd(buf);
7829                 if (ipcns_fdpair[1] < 0)
7830                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7831                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7832                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7833                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7834                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7835         }
7836
7837 finalize:
7838         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7839         if (r < 0)
7840                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7841         return 0;
7842 }
7843
7844 void exec_shared_runtime_vacuum(Manager *m) {
7845         ExecSharedRuntime *rt;
7846
7847         assert(m);
7848
7849         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7850
7851         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7852                 if (rt->n_ref > 0)
7853                         continue;
7854
7855                 (void) exec_shared_runtime_free(rt);
7856         }
7857 }
7858
7859 int exec_runtime_make(
7860                 const Unit *unit,
7861                 const ExecContext *context,
7862                 ExecSharedRuntime *shared,
7863                 DynamicCreds *creds,
7864                 ExecRuntime **ret) {
7865         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7866         _cleanup_free_ char *ephemeral = NULL;
7867         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7868         int r;
7869
7870         assert(unit);
7871         assert(context);
7872         assert(ret);
7873
7874         if (!shared && !creds && !exec_needs_ephemeral(context)) {
7875                 *ret = NULL;
7876                 return 0;
7877         }
7878
7879         if (exec_needs_ephemeral(context)) {
7880                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7881                 if (r < 0)
7882                         return r;
7883
7884                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7885                 if (r < 0)
7886                         return r;
7887
7888                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7889                         return -errno;
7890         }
7891
7892         rt = new(ExecRuntime, 1);
7893         if (!rt)
7894                 return -ENOMEM;
7895
7896         *rt = (ExecRuntime) {
7897                 .shared = shared,
7898                 .dynamic_creds = creds,
7899                 .ephemeral_copy = TAKE_PTR(ephemeral),
7900                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7901                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7902         };
7903
7904         *ret = TAKE_PTR(rt);
7905         return 1;
7906 }
7907
7908 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7909         if (!rt)
7910                 return NULL;
7911
7912         exec_shared_runtime_unref(rt->shared);
7913         dynamic_creds_unref(rt->dynamic_creds);
7914
7915         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7916
7917         safe_close_pair(rt->ephemeral_storage_socket);
7918         return mfree(rt);
7919 }
7920
7921 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7922         if (!rt)
7923                 return NULL;
7924
7925         rt->shared = exec_shared_runtime_destroy(rt->shared);
7926         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7927         return exec_runtime_free(rt);
7928 }
7929
7930 void exec_params_clear(ExecParameters *p) {
7931         if (!p)
7932                 return;
7933
7934         p->environment = strv_free(p->environment);
7935         p->fd_names = strv_free(p->fd_names);
7936         p->fds = mfree(p->fds);
7937         p->exec_fd = safe_close(p->exec_fd);
7938 }
7939
7940 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7941         if (!sc)
7942                 return NULL;
7943
7944         free(sc->id);
7945         free(sc->data);
7946         return mfree(sc);
7947 }
7948
7949 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7950         if (!lc)
7951                 return NULL;
7952
7953         free(lc->id);
7954         free(lc->path);
7955         return mfree(lc);
7956 }
7957
7958 void exec_directory_done(ExecDirectory *d) {
7959         if (!d)
7960                 return;
7961
7962         for (size_t i = 0; i < d->n_items; i++) {
7963                 free(d->items[i].path);
7964                 strv_free(d->items[i].symlinks);
7965         }
7966
7967         d->items = mfree(d->items);
7968         d->n_items = 0;
7969         d->mode = 0755;
7970 }
7971
7972 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7973         assert(d);
7974         assert(path);
7975
7976         for (size_t i = 0; i < d->n_items; i++)
7977                 if (path_equal(d->items[i].path, path))
7978                         return &d->items[i];
7979
7980         return NULL;
7981 }
7982
7983 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7984         _cleanup_strv_free_ char **s = NULL;
7985         _cleanup_free_ char *p = NULL;
7986         ExecDirectoryItem *existing;
7987         int r;
7988
7989         assert(d);
7990         assert(path);
7991
7992         existing = exec_directory_find(d, path);
7993         if (existing) {
7994                 r = strv_extend(&existing->symlinks, symlink);
7995                 if (r < 0)
7996                         return r;
7997
7998                 return 0; /* existing item is updated */
7999         }
8000
8001         p = strdup(path);
8002         if (!p)
8003                 return -ENOMEM;
8004
8005         if (symlink) {
8006                 s = strv_new(symlink);
8007                 if (!s)
8008                         return -ENOMEM;
8009         }
8010
8011         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
8012                 return -ENOMEM;
8013
8014         d->items[d->n_items++] = (ExecDirectoryItem) {
8015                 .path = TAKE_PTR(p),
8016                 .symlinks = TAKE_PTR(s),
8017         };
8018
8019         return 1; /* new item is added */
8020 }
8021
8022 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
8023         assert(a);
8024         assert(b);
8025
8026         return path_compare(a->path, b->path);
8027 }
8028
8029 void exec_directory_sort(ExecDirectory *d) {
8030         assert(d);
8031
8032         /* Sort the exec directories to make always parent directories processed at first in
8033          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8034          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8035          * list. See also comments in setup_exec_directory() and issue #24783. */
8036
8037         if (d->n_items <= 1)
8038                 return;
8039
8040         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8041
8042         for (size_t i = 1; i < d->n_items; i++)
8043                 for (size_t j = 0; j < i; j++)
8044                         if (path_startswith(d->items[i].path, d->items[j].path)) {
8045                                 d->items[i].only_create = true;
8046                                 break;
8047                         }
8048 }
8049
8050 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8051         ExecDirectoryType t;
8052
8053         assert(s);
8054
8055         if (streq(s, "all"))
8056                 return EXEC_CLEAN_ALL;
8057         if (streq(s, "fdstore"))
8058                 return EXEC_CLEAN_FDSTORE;
8059
8060         t = exec_resource_type_from_string(s);
8061         if (t < 0)
8062                 return (ExecCleanMask) t;
8063
8064         return 1U << t;
8065 }
8066
/* Hash operations for the credential hashmaps: keys are strings (hashed and compared with
 * string_hash_func()/string_compare_func()), values are ExecSetCredential and ExecLoadCredential objects
 * with exec_set_credential_free() and exec_load_credential_free() as their destructors. Presumably these
 * are the hash ops of the set_credentials/load_credentials hashmaps of ExecContext — confirm at the places
 * where those hashmaps are populated. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8069
/* Names of the ExecInput values, indexed by the enum value. The table is picked up by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below, which presumably generates the string conversion helpers
 * for ExecInput — see the macro definition. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8082
8083 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8084         [EXEC_OUTPUT_INHERIT] = "inherit",
8085         [EXEC_OUTPUT_NULL] = "null",
8086         [EXEC_OUTPUT_TTY] = "tty",
8087         [EXEC_OUTPUT_KMSG] = "kmsg",
8088         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8089         [EXEC_OUTPUT_JOURNAL] = "journal",
8090         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8091         [EXEC_OUTPUT_SOCKET] = "socket",
8092         [EXEC_OUTPUT_NAMED_FD] = "fd",
8093         [EXEC_OUTPUT_FILE] = "file",
8094         [EXEC_OUTPUT_FILE_APPEND] = "append",
8095         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8096 };
8097
8098 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8099
8100 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8101         [EXEC_UTMP_INIT] = "init",
8102         [EXEC_UTMP_LOGIN] = "login",
8103         [EXEC_UTMP_USER] = "user",
8104 };
8105
8106 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8107
8108 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8109         [EXEC_PRESERVE_NO] = "no",
8110         [EXEC_PRESERVE_YES] = "yes",
8111         [EXEC_PRESERVE_RESTART] = "restart",
8112 };
8113
8114 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8115
8116 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8117 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8118         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8119         [EXEC_DIRECTORY_STATE] = "StateDirectory",
8120         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8121         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8122         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8123 };
8124
8125 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8126
8127 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8128 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8129         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
8130         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
8131         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
8132         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
8133         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
8134 };
8135
8136 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8137
8138 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8139  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8140  * directories, specifically .timer units with their timestamp touch file. */
8141 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8142         [EXEC_DIRECTORY_RUNTIME] = "runtime",
8143         [EXEC_DIRECTORY_STATE] = "state",
8144         [EXEC_DIRECTORY_CACHE] = "cache",
8145         [EXEC_DIRECTORY_LOGS] = "logs",
8146         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8147 };
8148
8149 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8150
8151 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8152  * the service payload in. */
8153 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8154         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8155         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8156         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8157         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8158         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8159 };
8160
8161 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8162
8163 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8164         [EXEC_KEYRING_INHERIT] = "inherit",
8165         [EXEC_KEYRING_PRIVATE] = "private",
8166         [EXEC_KEYRING_SHARED] = "shared",
8167 };
8168
8169 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);