src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_SECCOMP
  29 #include <seccomp.h>
  30 #endif
  31
  32 #if HAVE_APPARMOR
  33 #include <sys/apparmor.h>
  34 #endif
  35
  36 #include "sd-messages.h"
  37
  38 #include "acl-util.h"
  39 #include "af-list.h"
  40 #include "alloc-util.h"
  41 #if HAVE_APPARMOR
  42 #include "apparmor-util.h"
  43 #endif
  44 #include "argv-util.h"
  45 #include "async.h"
  46 #include "barrier.h"
  47 #include "bpf-lsm.h"
  48 #include "btrfs-util.h"
  49 #include "cap-list.h"
  50 #include "capability-util.h"
  51 #include "chattr-util.h"
  52 #include "cgroup-setup.h"
  53 #include "chase.h"
  54 #include "chown-recursive.h"
  55 #include "constants.h"
  56 #include "cpu-set-util.h"
  57 #include "creds-util.h"
  58 #include "data-fd-util.h"
  59 #include "env-file.h"
  60 #include "env-util.h"
  61 #include "errno-list.h"
  62 #include "escape.h"
  63 #include "execute.h"
  64 #include "exit-status.h"
  65 #include "fd-util.h"
  66 #include "fileio.h"
  67 #include "format-util.h"
  68 #include "glob-util.h"
  69 #include "hexdecoct.h"
  70 #include "io-util.h"
  71 #include "ioprio-util.h"
  72 #include "label-util.h"
  73 #include "lock-util.h"
  74 #include "log.h"
  75 #include "macro.h"
  76 #include "manager.h"
  77 #include "manager-dump.h"
  78 #include "memory-util.h"
  79 #include "missing_fs.h"
  80 #include "missing_ioprio.h"
  81 #include "missing_prctl.h"
  82 #include "mkdir-label.h"
  83 #include "mount-util.h"
  84 #include "mountpoint-util.h"
  85 #include "namespace.h"
  86 #include "parse-util.h"
  87 #include "path-util.h"
  88 #include "proc-cmdline.h"
  89 #include "process-util.h"
  90 #include "psi-util.h"
  91 #include "random-util.h"
  92 #include "recurse-dir.h"
  93 #include "rlimit-util.h"
  94 #include "rm-rf.h"
  95 #if HAVE_SECCOMP
  96 #include "seccomp-util.h"
  97 #endif
  98 #include "securebits-util.h"
  99 #include "selinux-util.h"
 100 #include "signal-util.h"
 101 #include "smack-util.h"
 102 #include "socket-util.h"
 103 #include "sort-util.h"
 104 #include "special.h"
 105 #include "stat-util.h"
 106 #include "string-table.h"
 107 #include "string-util.h"
 108 #include "strv.h"
 109 #include "syslog-util.h"
 110 #include "terminal-util.h"
 111 #include "tmpfile-util.h"
 112 #include "umask-util.h"
 113 #include "unit-serialize.h"
 114 #include "user-util.h"
 115 #include "utmp-wtmp.h"
 116
/* Two idle-related timeouts, 5s and 1s respectively. NOTE(review): their users are outside this
 * excerpt; presumably they bound how long a child waits on the manager's idle pipe — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the stream socket that
 * connect_logger_as() connects to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
 121
 122 static int shift_fds(int fds[], size_t n_fds) {
 123         if (n_fds <= 0)
 124                 return 0;
 125
 126         /* Modifies the fds array! (sorts it) */
 127
 128         assert(fds);
 129
 130         for (int start = 0;;) {
 131                 int restart_from = -1;
 132
 133                 for (int i = start; i < (int) n_fds; i++) {
 134                         int nfd;
 135
 136                         /* Already at right index? */
 137                         if (fds[i] == i+3)
 138                                 continue;
 139
 140                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 141                         if (nfd < 0)
 142                                 return -errno;
 143
 144                         safe_close(fds[i]);
 145                         fds[i] = nfd;
 146
 147                         /* Hmm, the fd we wanted isn't free? Then
 148                          * let's remember that and try again from here */
 149                         if (nfd != i+3 && restart_from < 0)
 150                                 restart_from = i;
 151                 }
 152
 153                 if (restart_from < 0)
 154                         break;
 155
 156                 start = restart_from;
 157         }
 158
 159         return 0;
 160 }
 161
 162 static int flags_fds(
 163                 const int fds[],
 164                 size_t n_socket_fds,
 165                 size_t n_fds,
 166                 bool nonblock) {
 167
 168         int r;
 169
 170         if (n_fds <= 0)
 171                 return 0;
 172
 173         assert(fds);
 174
 175         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 176          * O_NONBLOCK only applies to socket activation though. */
 177
 178         for (size_t i = 0; i < n_fds; i++) {
 179
 180                 if (i < n_socket_fds) {
 181                         r = fd_nonblock(fds[i], nonblock);
 182                         if (r < 0)
 183                                 return r;
 184                 }
 185
 186                 /* We unconditionally drop FD_CLOEXEC from the fds,
 187                  * since after all we want to pass these fds to our
 188                  * children */
 189
 190                 r = fd_cloexec(fds[i], false);
 191                 if (r < 0)
 192                         return r;
 193         }
 194
 195         return 0;
 196 }
 197
 198 static const char *exec_context_tty_path(const ExecContext *context) {
 199         assert(context);
 200
 201         if (context->stdio_as_fds)
 202                 return NULL;
 203
 204         if (context->tty_path)
 205                 return context->tty_path;
 206
 207         return "/dev/console";
 208 }
 209
 210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 211         unsigned rows, cols;
 212         const char *tty;
 213
 214         assert(context);
 215         assert(ret_rows);
 216         assert(ret_cols);
 217
 218         rows = context->tty_rows;
 219         cols = context->tty_cols;
 220
 221         tty = exec_context_tty_path(context);
 222         if (tty)
 223                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 224
 225         *ret_rows = rows;
 226         *ret_cols = cols;
 227
 228         return 0;
 229 }
 230
 231 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 232         _cleanup_close_ int fd = -EBADF;
 233         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 234
 235         /* Take a lock around the device for the duration of the setup that we do here.
 236          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 237          * We open a new fd that will be closed automatically, and operate on it for convenience.
 238          */
 239
 240         if (p && p->stdin_fd >= 0) {
 241                 fd = xopenat_lock(p->stdin_fd, NULL,
 242                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 243                 if (fd < 0)
 244                         return;
 245         } else if (path) {
 246                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 247                 if (fd < 0)
 248                         return;
 249
 250                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 251                         return;
 252         } else
 253                 return;   /* nothing to do */
 254
 255         if (context->tty_vhangup)
 256                 (void) terminal_vhangup_fd(fd);
 257
 258         if (context->tty_reset)
 259                 (void) reset_terminal_fd(fd, true);
 260
 261         if (p && p->stdin_fd >= 0) {
 262                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 263
 264                 (void) exec_context_tty_size(context, &rows, &cols);
 265                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 266         }
 267
 268         if (context->tty_vt_disallocate && path)
 269                 (void) vt_disallocate(path);
 270 }
 271
 272 static bool is_terminal_input(ExecInput i) {
 273         return IN_SET(i,
 274                       EXEC_INPUT_TTY,
 275                       EXEC_INPUT_TTY_FORCE,
 276                       EXEC_INPUT_TTY_FAIL);
 277 }
 278
 279 static bool is_terminal_output(ExecOutput o) {
 280         return IN_SET(o,
 281                       EXEC_OUTPUT_TTY,
 282                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 283                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 284 }
 285
 286 static bool is_kmsg_output(ExecOutput o) {
 287         return IN_SET(o,
 288                       EXEC_OUTPUT_KMSG,
 289                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 290 }
 291
 292 static bool exec_context_needs_term(const ExecContext *c) {
 293         assert(c);
 294
 295         /* Return true if the execution context suggests we should set $TERM to something useful. */
 296
 297         if (is_terminal_input(c->std_input))
 298                 return true;
 299
 300         if (is_terminal_output(c->std_output))
 301                 return true;
 302
 303         if (is_terminal_output(c->std_error))
 304                 return true;
 305
 306         return !!c->tty_path;
 307 }
 308
 309 static int open_null_as(int flags, int nfd) {
 310         int fd;
 311
 312         assert(nfd >= 0);
 313
 314         fd = open("/dev/null", flags|O_NOCTTY);
 315         if (fd < 0)
 316                 return -errno;
 317
 318         return move_fd(fd, nfd, false);
 319 }
 320
 321 static int connect_journal_socket(
 322                 int fd,
 323                 const char *log_namespace,
 324                 uid_t uid,
 325                 gid_t gid) {
 326
 327         uid_t olduid = UID_INVALID;
 328         gid_t oldgid = GID_INVALID;
 329         const char *j;
 330         int r;
 331
 332         j = log_namespace ?
 333                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 334                 "/run/systemd/journal/stdout";
 335
 336         if (gid_is_valid(gid)) {
 337                 oldgid = getgid();
 338
 339                 if (setegid(gid) < 0)
 340                         return -errno;
 341         }
 342
 343         if (uid_is_valid(uid)) {
 344                 olduid = getuid();
 345
 346                 if (seteuid(uid) < 0) {
 347                         r = -errno;
 348                         goto restore_gid;
 349                 }
 350         }
 351
 352         r = connect_unix_path(fd, AT_FDCWD, j);
 353
 354         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 355            an LSM interferes. */
 356
 357         if (uid_is_valid(uid))
 358                 (void) seteuid(olduid);
 359
 360  restore_gid:
 361         if (gid_is_valid(gid))
 362                 (void) setegid(oldgid);
 363
 364         return r;
 365 }
 366
 367 static int connect_logger_as(
 368                 const Unit *unit,
 369                 const ExecContext *context,
 370                 const ExecParameters *params,
 371                 ExecOutput output,
 372                 const char *ident,
 373                 int nfd,
 374                 uid_t uid,
 375                 gid_t gid) {
 376
 377         _cleanup_close_ int fd = -EBADF;
 378         int r;
 379
 380         assert(context);
 381         assert(params);
 382         assert(output < _EXEC_OUTPUT_MAX);
 383         assert(ident);
 384         assert(nfd >= 0);
 385
 386         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 387         if (fd < 0)
 388                 return -errno;
 389
 390         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 391         if (r < 0)
 392                 return r;
 393
 394         if (shutdown(fd, SHUT_RD) < 0)
 395                 return -errno;
 396
 397         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 398
 399         if (dprintf(fd,
 400                 "%s\n"
 401                 "%s\n"
 402                 "%i\n"
 403                 "%i\n"
 404                 "%i\n"
 405                 "%i\n"
 406                 "%i\n",
 407                 context->syslog_identifier ?: ident,
 408                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 409                 context->syslog_priority,
 410                 !!context->syslog_level_prefix,
 411                 false,
 412                 is_kmsg_output(output),
 413                 is_terminal_output(output)) < 0)
 414                 return -errno;
 415
 416         return move_fd(TAKE_FD(fd), nfd, false);
 417 }
 418
 419 static int open_terminal_as(const char *path, int flags, int nfd) {
 420         int fd;
 421
 422         assert(path);
 423         assert(nfd >= 0);
 424
 425         fd = open_terminal(path, flags | O_NOCTTY);
 426         if (fd < 0)
 427                 return fd;
 428
 429         return move_fd(fd, nfd, false);
 430 }
 431
 432 static int acquire_path(const char *path, int flags, mode_t mode) {
 433         _cleanup_close_ int fd = -EBADF;
 434         int r;
 435
 436         assert(path);
 437
 438         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 439                 flags |= O_CREAT;
 440
 441         fd = open(path, flags|O_NOCTTY, mode);
 442         if (fd >= 0)
 443                 return TAKE_FD(fd);
 444
 445         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 446                 return -errno;
 447
 448         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 449
 450         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 451         if (fd < 0)
 452                 return -errno;
 453
 454         r = connect_unix_path(fd, AT_FDCWD, path);
 455         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 456                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 457                  * wasn't an AF_UNIX socket after all */
 458                 return -ENXIO;
 459         if (r < 0)
 460                 return r;
 461
 462         if ((flags & O_ACCMODE) == O_RDONLY)
 463                 r = shutdown(fd, SHUT_WR);
 464         else if ((flags & O_ACCMODE) == O_WRONLY)
 465                 r = shutdown(fd, SHUT_RD);
 466         else
 467                 r = 0;
 468         if (r < 0)
 469                 return -errno;
 470
 471         return TAKE_FD(fd);
 472 }
 473
 474 static int fixup_input(
 475                 const ExecContext *context,
 476                 int socket_fd,
 477                 bool apply_tty_stdin) {
 478
 479         ExecInput std_input;
 480
 481         assert(context);
 482
 483         std_input = context->std_input;
 484
 485         if (is_terminal_input(std_input) && !apply_tty_stdin)
 486                 return EXEC_INPUT_NULL;
 487
 488         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 489                 return EXEC_INPUT_NULL;
 490
 491         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 492                 return EXEC_INPUT_NULL;
 493
 494         return std_input;
 495 }
 496
 497 static int fixup_output(ExecOutput output, int socket_fd) {
 498
 499         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 500                 return EXEC_OUTPUT_INHERIT;
 501
 502         return output;
 503 }
 504
 505 static int setup_input(
 506                 const ExecContext *context,
 507                 const ExecParameters *params,
 508                 int socket_fd,
 509                 const int named_iofds[static 3]) {
 510
 511         ExecInput i;
 512         int r;
 513
 514         assert(context);
 515         assert(params);
 516         assert(named_iofds);
 517
 518         if (params->stdin_fd >= 0) {
 519                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 520                         return -errno;
 521
 522                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 523                 if (isatty(STDIN_FILENO)) {
 524                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 525
 526                         (void) exec_context_tty_size(context, &rows, &cols);
 527                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 528                         (void) reset_terminal_fd(STDIN_FILENO, true);
 529                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 530                 }
 531
 532                 return STDIN_FILENO;
 533         }
 534
 535         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 536
 537         switch (i) {
 538
 539         case EXEC_INPUT_NULL:
 540                 return open_null_as(O_RDONLY, STDIN_FILENO);
 541
 542         case EXEC_INPUT_TTY:
 543         case EXEC_INPUT_TTY_FORCE:
 544         case EXEC_INPUT_TTY_FAIL: {
 545                 unsigned rows, cols;
 546                 int fd;
 547
 548                 fd = acquire_terminal(exec_context_tty_path(context),
 549                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 550                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 551                                                                   ACQUIRE_TERMINAL_WAIT,
 552                                       USEC_INFINITY);
 553                 if (fd < 0)
 554                         return fd;
 555
 556                 r = exec_context_tty_size(context, &rows, &cols);
 557                 if (r < 0)
 558                         return r;
 559
 560                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 561                 if (r < 0)
 562                         return r;
 563
 564                 return move_fd(fd, STDIN_FILENO, false);
 565         }
 566
 567         case EXEC_INPUT_SOCKET:
 568                 assert(socket_fd >= 0);
 569
 570                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 571
 572         case EXEC_INPUT_NAMED_FD:
 573                 assert(named_iofds[STDIN_FILENO] >= 0);
 574
 575                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 576                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 577
 578         case EXEC_INPUT_DATA: {
 579                 int fd;
 580
 581                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 582                 if (fd < 0)
 583                         return fd;
 584
 585                 return move_fd(fd, STDIN_FILENO, false);
 586         }
 587
 588         case EXEC_INPUT_FILE: {
 589                 bool rw;
 590                 int fd;
 591
 592                 assert(context->stdio_file[STDIN_FILENO]);
 593
 594                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 595                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 596
 597                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 598                 if (fd < 0)
 599                         return fd;
 600
 601                 return move_fd(fd, STDIN_FILENO, false);
 602         }
 603
 604         default:
 605                 assert_not_reached();
 606         }
 607 }
 608
 609 static bool can_inherit_stderr_from_stdout(
 610                 const ExecContext *context,
 611                 ExecOutput o,
 612                 ExecOutput e) {
 613
 614         assert(context);
 615
 616         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 617          * stderr fd */
 618
 619         if (e == EXEC_OUTPUT_INHERIT)
 620                 return true;
 621         if (e != o)
 622                 return false;
 623
 624         if (e == EXEC_OUTPUT_NAMED_FD)
 625                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 626
 627         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 628                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 629
 630         return true;
 631 }
 632
 633 static int setup_output(
 634                 const Unit *unit,
 635                 const ExecContext *context,
 636                 const ExecParameters *params,
 637                 int fileno,
 638                 int socket_fd,
 639                 const int named_iofds[static 3],
 640                 const char *ident,
 641                 uid_t uid,
 642                 gid_t gid,
 643                 dev_t *journal_stream_dev,
 644                 ino_t *journal_stream_ino) {
 645
 646         ExecOutput o;
 647         ExecInput i;
 648         int r;
 649
 650         assert(unit);
 651         assert(context);
 652         assert(params);
 653         assert(ident);
 654         assert(journal_stream_dev);
 655         assert(journal_stream_ino);
 656
 657         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 658
 659                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 660                         return -errno;
 661
 662                 return STDOUT_FILENO;
 663         }
 664
 665         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 666                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 667                         return -errno;
 668
 669                 return STDERR_FILENO;
 670         }
 671
 672         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 673         o = fixup_output(context->std_output, socket_fd);
 674
 675         if (fileno == STDERR_FILENO) {
 676                 ExecOutput e;
 677                 e = fixup_output(context->std_error, socket_fd);
 678
 679                 /* This expects the input and output are already set up */
 680
 681                 /* Don't change the stderr file descriptor if we inherit all
 682                  * the way and are not on a tty */
 683                 if (e == EXEC_OUTPUT_INHERIT &&
 684                     o == EXEC_OUTPUT_INHERIT &&
 685                     i == EXEC_INPUT_NULL &&
 686                     !is_terminal_input(context->std_input) &&
 687                     getppid() != 1)
 688                         return fileno;
 689
 690                 /* Duplicate from stdout if possible */
 691                 if (can_inherit_stderr_from_stdout(context, o, e))
 692                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 693
 694                 o = e;
 695
 696         } else if (o == EXEC_OUTPUT_INHERIT) {
 697                 /* If input got downgraded, inherit the original value */
 698                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 699                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 700
 701                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 702                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 703                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 704
 705                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 706                 if (getppid() != 1)
 707                         return fileno;
 708
 709                 /* We need to open /dev/null here anew, to get the right access mode. */
 710                 return open_null_as(O_WRONLY, fileno);
 711         }
 712
 713         switch (o) {
 714
 715         case EXEC_OUTPUT_NULL:
 716                 return open_null_as(O_WRONLY, fileno);
 717
 718         case EXEC_OUTPUT_TTY:
 719                 if (is_terminal_input(i))
 720                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 721
 722                 /* We don't reset the terminal if this is just about output */
 723                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 724
 725         case EXEC_OUTPUT_KMSG:
 726         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 727         case EXEC_OUTPUT_JOURNAL:
 728         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 729                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 730                 if (r < 0) {
 731                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 732                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 733                         r = open_null_as(O_WRONLY, fileno);
 734                 } else {
 735                         struct stat st;
 736
 737                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 738                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 739                          * services to detect whether they are connected to the journal or not.
 740                          *
 741                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 742                          * about STDERR as that's usually the best way to do logging. */
 743
 744                         if (fstat(fileno, &st) >= 0 &&
 745                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 746                                 *journal_stream_dev = st.st_dev;
 747                                 *journal_stream_ino = st.st_ino;
 748                         }
 749                 }
 750                 return r;
 751
 752         case EXEC_OUTPUT_SOCKET:
 753                 assert(socket_fd >= 0);
 754
 755                 return RET_NERRNO(dup2(socket_fd, fileno));
 756
 757         case EXEC_OUTPUT_NAMED_FD:
 758                 assert(named_iofds[fileno] >= 0);
 759
 760                 (void) fd_nonblock(named_iofds[fileno], false);
 761                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 762
 763         case EXEC_OUTPUT_FILE:
 764         case EXEC_OUTPUT_FILE_APPEND:
 765         case EXEC_OUTPUT_FILE_TRUNCATE: {
 766                 bool rw;
 767                 int fd, flags;
 768
 769                 assert(context->stdio_file[fileno]);
 770
 771                 rw = context->std_input == EXEC_INPUT_FILE &&
 772                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 773
 774                 if (rw)
 775                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 776
 777                 flags = O_WRONLY;
 778                 if (o == EXEC_OUTPUT_FILE_APPEND)
 779                         flags |= O_APPEND;
 780                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 781                         flags |= O_TRUNC;
 782
 783                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 784                 if (fd < 0)
 785                         return fd;
 786
 787                 return move_fd(fd, fileno, 0);
 788         }
 789
 790         default:
 791                 assert_not_reached();
 792         }
 793 }
 794
 795 static int chown_terminal(int fd, uid_t uid) {
 796         int r;
 797
 798         assert(fd >= 0);
 799
 800         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 801         if (isatty(fd) < 1) {
 802                 if (IN_SET(errno, EINVAL, ENOTTY))
 803                         return 0; /* not a tty */
 804
 805                 return -errno;
 806         }
 807
 808         /* This might fail. What matters are the results. */
 809         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 810         if (r < 0)
 811                 return r;
 812
 813         return 1;
 814 }
 815
 816 static int setup_confirm_stdio(
 817                 const ExecContext *context,
 818                 const char *vc,
 819                 int *ret_saved_stdin,
 820                 int *ret_saved_stdout) {
 821
 822         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 823         unsigned rows, cols;
 824         int r;
 825
 826         assert(ret_saved_stdin);
 827         assert(ret_saved_stdout);
 828
 829         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 830         if (saved_stdin < 0)
 831                 return -errno;
 832
 833         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 834         if (saved_stdout < 0)
 835                 return -errno;
 836
 837         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 838         if (fd < 0)
 839                 return fd;
 840
 841         r = chown_terminal(fd, getuid());
 842         if (r < 0)
 843                 return r;
 844
 845         r = reset_terminal_fd(fd, true);
 846         if (r < 0)
 847                 return r;
 848
 849         r = exec_context_tty_size(context, &rows, &cols);
 850         if (r < 0)
 851                 return r;
 852
 853         r = terminal_set_size_fd(fd, vc, rows, cols);
 854         if (r < 0)
 855                 return r;
 856
 857         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 858         TAKE_FD(fd);
 859         if (r < 0)
 860                 return r;
 861
 862         *ret_saved_stdin = TAKE_FD(saved_stdin);
 863         *ret_saved_stdout = TAKE_FD(saved_stdout);
 864         return 0;
 865 }
 866
 867 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 868         assert(err < 0);
 869
 870         if (err == -ETIMEDOUT)
 871                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 872         else {
 873                 errno = -err;
 874                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 875         }
 876 }
 877
 878 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 879         _cleanup_close_ int fd = -EBADF;
 880
 881         assert(vc);
 882
 883         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 884         if (fd < 0)
 885                 return;
 886
 887         write_confirm_error_fd(err, fd, u);
 888 }
 889
 890 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 891         int r = 0;
 892
 893         assert(saved_stdin);
 894         assert(saved_stdout);
 895
 896         release_terminal();
 897
 898         if (*saved_stdin >= 0)
 899                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 900                         r = -errno;
 901
 902         if (*saved_stdout >= 0)
 903                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 904                         r = -errno;
 905
 906         *saved_stdin = safe_close(*saved_stdin);
 907         *saved_stdout = safe_close(*saved_stdout);
 908
 909         return r;
 910 }
 911
/* Possible outcomes of ask_for_confirmation(). */
enum {
        /* Chosen when the user answers 'f': don't execute the command and pretend it failed. */
        CONFIRM_PRETEND_FAILURE = -1,
        /* NOTE(review): presumably the 's' answer (don't execute, pretend success) — the code that
         * returns this is outside this excerpt, confirm. */
        CONFIRM_PRETEND_SUCCESS =  0,
        /* Execute the command. Also used as fallback whenever asking fails due to an internal error. */
        CONFIRM_EXECUTE = 1,
};
 917
 918 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 919         int saved_stdout = -1, saved_stdin = -1, r;
 920         _cleanup_free_ char *e = NULL;
 921         char c;
 922
 923         /* For any internal errors, assume a positive response. */
 924         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 925         if (r < 0) {
 926                 write_confirm_error(r, vc, u);
 927                 return CONFIRM_EXECUTE;
 928         }
 929
 930         /* confirm_spawn might have been disabled while we were sleeping. */
 931         if (manager_is_confirm_spawn_disabled(u->manager)) {
 932                 r = 1;
 933                 goto restore_stdio;
 934         }
 935
 936         e = ellipsize(cmdline, 60, 100);
 937         if (!e) {
 938                 log_oom();
 939                 r = CONFIRM_EXECUTE;
 940                 goto restore_stdio;
 941         }
 942
 943         for (;;) {
 944                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 945                 if (r < 0) {
 946                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 947                         r = CONFIRM_EXECUTE;
 948                         goto restore_stdio;
 949                 }
 950
 951                 switch (c) {
 952                 case 'c':
 953                         printf("Resuming normal execution.\n");
 954                         manager_disable_confirm_spawn();
 955                         r = 1;
 956                         break;
 957                 case 'D':
 958                         unit_dump(u, stdout, "  ");
 959                         continue; /* ask again */
 960                 case 'f':
 961                         printf("Failing execution.\n");
 962                         r = CONFIRM_PRETEND_FAILURE;
 963                         break;
 964                 case 'h':
 965                         printf("  c - continue, proceed without asking anymore\n"
 966                                "  D - dump, show the state of the unit\n"
 967                                "  f - fail, don't execute the command and pretend it failed\n"
 968                                "  h - help\n"
 969                                "  i - info, show a short summary of the unit\n"
 970                                "  j - jobs, show jobs that are in progress\n"
 971                                "  s - skip, don't execute the command and pretend it succeeded\n"
 972                                "  y - yes, execute the command\n");
 973                         continue; /* ask again */
 974                 case 'i':
 975                         printf("  Description: %s\n"
 976                                "  Unit:        %s\n"
 977                                "  Command:     %s\n",
 978                                u->id, u->description, cmdline);
 979                         continue; /* ask again */
 980                 case 'j':
 981                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 982                         continue; /* ask again */
 983                 case 'n':
 984                         /* 'n' was removed in favor of 'f'. */
 985                         printf("Didn't understand 'n', did you mean 'f'?\n");
 986                         continue; /* ask again */
 987                 case 's':
 988                         printf("Skipping execution.\n");
 989                         r = CONFIRM_PRETEND_SUCCESS;
 990                         break;
 991                 case 'y':
 992                         r = CONFIRM_EXECUTE;
 993                         break;
 994                 default:
 995                         assert_not_reached();
 996                 }
 997                 break;
 998         }
 999
1000 restore_stdio:
1001         restore_confirm_stdio(&saved_stdin, &saved_stdout);
1002         return r;
1003 }
1004
1005 static int get_fixed_user(const ExecContext *c, const char **user,
1006                           uid_t *uid, gid_t *gid,
1007                           const char **home, const char **shell) {
1008         int r;
1009         const char *name;
1010
1011         assert(c);
1012
1013         if (!c->user)
1014                 return 0;
1015
1016         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1017          * (i.e. are "/" or "/bin/nologin"). */
1018
1019         name = c->user;
1020         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1021         if (r < 0)
1022                 return r;
1023
1024         *user = name;
1025         return 0;
1026 }
1027
1028 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1029         int r;
1030         const char *name;
1031
1032         assert(c);
1033
1034         if (!c->group)
1035                 return 0;
1036
1037         name = c->group;
1038         r = get_group_creds(&name, gid, 0);
1039         if (r < 0)
1040                 return r;
1041
1042         *group = name;
1043         return 0;
1044 }
1045
1046 static int get_supplementary_groups(const ExecContext *c, const char *user,
1047                                     const char *group, gid_t gid,
1048                                     gid_t **supplementary_gids, int *ngids) {
1049         int r, k = 0;
1050         int ngroups_max;
1051         bool keep_groups = false;
1052         gid_t *groups = NULL;
1053         _cleanup_free_ gid_t *l_gids = NULL;
1054
1055         assert(c);
1056
1057         /*
1058          * If user is given, then lookup GID and supplementary groups list.
1059          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1060          * here and as early as possible so we keep the list of supplementary
1061          * groups of the caller.
1062          */
1063         if (user && gid_is_valid(gid) && gid != 0) {
1064                 /* First step, initialize groups from /etc/groups */
1065                 if (initgroups(user, gid) < 0)
1066                         return -errno;
1067
1068                 keep_groups = true;
1069         }
1070
1071         if (strv_isempty(c->supplementary_groups))
1072                 return 0;
1073
1074         /*
1075          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1076          * be positive, otherwise fail.
1077          */
1078         errno = 0;
1079         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1080         if (ngroups_max <= 0)
1081                 return errno_or_else(EOPNOTSUPP);
1082
1083         l_gids = new(gid_t, ngroups_max);
1084         if (!l_gids)
1085                 return -ENOMEM;
1086
1087         if (keep_groups) {
1088                 /*
1089                  * Lookup the list of groups that the user belongs to, we
1090                  * avoid NSS lookups here too for gid=0.
1091                  */
1092                 k = ngroups_max;
1093                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1094                         return -EINVAL;
1095         } else
1096                 k = 0;
1097
1098         STRV_FOREACH(i, c->supplementary_groups) {
1099                 const char *g;
1100
1101                 if (k >= ngroups_max)
1102                         return -E2BIG;
1103
1104                 g = *i;
1105                 r = get_group_creds(&g, l_gids+k, 0);
1106                 if (r < 0)
1107                         return r;
1108
1109                 k++;
1110         }
1111
1112         /*
1113          * Sets ngids to zero to drop all supplementary groups, happens
1114          * when we are under root and SupplementaryGroups= is empty.
1115          */
1116         if (k == 0) {
1117                 *ngids = 0;
1118                 return 0;
1119         }
1120
1121         /* Otherwise get the final list of supplementary groups */
1122         groups = memdup(l_gids, sizeof(gid_t) * k);
1123         if (!groups)
1124                 return -ENOMEM;
1125
1126         *supplementary_gids = groups;
1127         *ngids = k;
1128
1129         groups = NULL;
1130
1131         return 0;
1132 }
1133
/* Applies the group identity: first the supplementary groups (only if 'ngids' is positive), then the
 * real, effective and saved GID (only if 'gid' is valid). Returns 0 on success or a negative
 * errno-style value if either step fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1152
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if the securebits
 * were changed, or a negative errno if querying or setting them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        unsigned wanted = ((unsigned) before & ~mask) | bits;
        if (wanted == (unsigned) before)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1171
1172 static int enforce_user(
1173                 const ExecContext *context,
1174                 uid_t uid,
1175                 uint64_t capability_ambient_set) {
1176         assert(context);
1177         int r;
1178
1179         if (!uid_is_valid(uid))
1180                 return 0;
1181
1182         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1183          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1184          * case. */
1185
1186         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1187
1188                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1189                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1190                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1191                 if (r < 0)
1192                         return r;
1193         }
1194
1195         /* Second step: actually set the uids */
1196         if (setresuid(uid, uid, uid) < 0)
1197                 return -errno;
1198
1199         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1200          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1201          * outside of this call. */
1202         return 0;
1203 }
1204
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: none of the arguments are looked at and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */
        return PAM_CONV_ERR;
}

#endif
1219
/* Opens a PAM session for service 'name' and user 'user' and forks off a helper process ("(sd-pam)")
 * that closes the session again once the calling process has gone away.
 *
 *   name, user - PAM service name and user name, passed to pam_start()
 *   uid, gid   - credentials the helper process drops to
 *   tty        - TTY to report to PAM; if NULL and stdin is a TTY, that one is used
 *   env        - environment passed to PAM via pam_putenv(); replaced by the PAM environment list
 *                on success
 *   fds, n_fds - file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success and a negative errno-style value on
 * failure (-EPERM for PAM errors). Without HAVE_PAM this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() returns 0 on success and a positive error number on
                                 * failure; it never returns a negative value and does not set errno. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set) or a non-PAM step failed (r is set). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1436
1437 static void rename_process_from_path(const char *path) {
1438         _cleanup_free_ char *buf = NULL;
1439         const char *p;
1440
1441         assert(path);
1442
1443         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1444          * /bin/ps */
1445
1446         if (path_extract_filename(path, &buf) < 0) {
1447                 rename_process("(...)");
1448                 return;
1449         }
1450
1451         size_t l = strlen(buf);
1452         if (l > 8) {
1453                 /* The end of the process name is usually more interesting, since the first bit might just be
1454                  * "systemd-" */
1455                 p = buf + l - 8;
1456                 l = 8;
1457         } else
1458                 p = buf;
1459
1460         char process_name[11];
1461         process_name[0] = '(';
1462         memcpy(process_name+1, p, l);
1463         process_name[1+l] = ')';
1464         process_name[1+l+1] = 0;
1465
1466         rename_process(process_name);
1467 }
1468
1469 static bool context_has_address_families(const ExecContext *c) {
1470         assert(c);
1471
1472         return c->address_families_allow_list ||
1473                 !set_isempty(c->address_families);
1474 }
1475
1476 static bool context_has_syscall_filters(const ExecContext *c) {
1477         assert(c);
1478
1479         return c->syscall_allow_list ||
1480                 !hashmap_isempty(c->syscall_filter);
1481 }
1482
1483 static bool context_has_syscall_logs(const ExecContext *c) {
1484         assert(c);
1485
1486         return c->syscall_log_allow_list ||
1487                 !hashmap_isempty(c->syscall_log);
1488 }
1489
1490 static bool context_has_no_new_privileges(const ExecContext *c) {
1491         assert(c);
1492
1493         if (c->no_new_privileges)
1494                 return true;
1495
1496         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1497                 return false;
1498
1499         /* We need NNP if we have any form of seccomp and are unprivileged */
1500         return c->lock_personality ||
1501                 c->memory_deny_write_execute ||
1502                 c->private_devices ||
1503                 c->protect_clock ||
1504                 c->protect_hostname ||
1505                 c->protect_kernel_tunables ||
1506                 c->protect_kernel_modules ||
1507                 c->protect_kernel_logs ||
1508                 context_has_address_families(c) ||
1509                 exec_context_restrict_namespaces_set(c) ||
1510                 c->restrict_realtime ||
1511                 c->restrict_suid_sgid ||
1512                 !set_isempty(c->syscall_archs) ||
1513                 context_has_syscall_filters(c) ||
1514                 context_has_syscall_logs(c);
1515 }
1516
1517 bool exec_context_has_credentials(const ExecContext *context) {
1518
1519         assert(context);
1520
1521         return !hashmap_isempty(context->set_credentials) ||
1522                 !hashmap_isempty(context->load_credentials) ||
1523                 !set_isempty(context->import_credentials);
1524 }
1525
1526 #if HAVE_SECCOMP
1527
1528 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1529
1530         if (is_seccomp_available())
1531                 return false;
1532
1533         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1534         return true;
1535 }
1536
1537 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1538         uint32_t negative_action, default_action, action;
1539         int r;
1540
1541         assert(u);
1542         assert(c);
1543
1544         if (!context_has_syscall_filters(c))
1545                 return 0;
1546
1547         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1548                 return 0;
1549
1550         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1551
1552         if (c->syscall_allow_list) {
1553                 default_action = negative_action;
1554                 action = SCMP_ACT_ALLOW;
1555         } else {
1556                 default_action = SCMP_ACT_ALLOW;
1557                 action = negative_action;
1558         }
1559
1560         if (needs_ambient_hack) {
1561                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1562                 if (r < 0)
1563                         return r;
1564         }
1565
1566         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1567 }
1568
1569 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1570 #ifdef SCMP_ACT_LOG
1571         uint32_t default_action, action;
1572 #endif
1573
1574         assert(u);
1575         assert(c);
1576
1577         if (!context_has_syscall_logs(c))
1578                 return 0;
1579
1580 #ifdef SCMP_ACT_LOG
1581         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1582                 return 0;
1583
1584         if (c->syscall_log_allow_list) {
1585                 /* Log nothing but the ones listed */
1586                 default_action = SCMP_ACT_ALLOW;
1587                 action = SCMP_ACT_LOG;
1588         } else {
1589                 /* Log everything but the ones listed */
1590                 default_action = SCMP_ACT_LOG;
1591                 action = SCMP_ACT_ALLOW;
1592         }
1593
1594         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1595 #else
1596         /* old libseccomp */
1597         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1598         return 0;
1599 #endif
1600 }
1601
1602 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1603         assert(u);
1604         assert(c);
1605
1606         if (set_isempty(c->syscall_archs))
1607                 return 0;
1608
1609         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1610                 return 0;
1611
1612         return seccomp_restrict_archs(c->syscall_archs);
1613 }
1614
1615 static int apply_address_families(const Unit* u, const ExecContext *c) {
1616         assert(u);
1617         assert(c);
1618
1619         if (!context_has_address_families(c))
1620                 return 0;
1621
1622         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1623                 return 0;
1624
1625         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1626 }
1627
1628 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1629         int r;
1630
1631         assert(u);
1632         assert(c);
1633
1634         if (!c->memory_deny_write_execute)
1635                 return 0;
1636
1637         /* use prctl() if kernel supports it (6.3) */
1638         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1639         if (r == 0) {
1640                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1641                 return 0;
1642         }
1643         if (r < 0 && errno != EINVAL)
1644                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1645         /* else use seccomp */
1646         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1647
1648         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1649                 return 0;
1650
1651         return seccomp_memory_deny_write_execute();
1652 }
1653
1654 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1655         assert(u);
1656         assert(c);
1657
1658         if (!c->restrict_realtime)
1659                 return 0;
1660
1661         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1662                 return 0;
1663
1664         return seccomp_restrict_realtime();
1665 }
1666
1667 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1668         assert(u);
1669         assert(c);
1670
1671         if (!c->restrict_suid_sgid)
1672                 return 0;
1673
1674         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1675                 return 0;
1676
1677         return seccomp_restrict_suid_sgid();
1678 }
1679
1680 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1681         assert(u);
1682         assert(c);
1683
1684         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1685          * let's protect even those systems where this is left on in the kernel. */
1686
1687         if (!c->protect_kernel_tunables)
1688                 return 0;
1689
1690         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1691                 return 0;
1692
1693         return seccomp_protect_sysctl();
1694 }
1695
1696 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1697         assert(u);
1698         assert(c);
1699
1700         /* Turn off module syscalls on ProtectKernelModules=yes */
1701
1702         if (!c->protect_kernel_modules)
1703                 return 0;
1704
1705         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1706                 return 0;
1707
1708         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1709 }
1710
1711 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1712         assert(u);
1713         assert(c);
1714
1715         if (!c->protect_kernel_logs)
1716                 return 0;
1717
1718         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1719                 return 0;
1720
1721         return seccomp_protect_syslog();
1722 }
1723
1724 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1725         assert(u);
1726         assert(c);
1727
1728         if (!c->protect_clock)
1729                 return 0;
1730
1731         if (skip_seccomp_unavailable(u, "ProtectClock="))
1732                 return 0;
1733
1734         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1735 }
1736
1737 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1738         assert(u);
1739         assert(c);
1740
1741         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1742
1743         if (!c->private_devices)
1744                 return 0;
1745
1746         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1747                 return 0;
1748
1749         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1750 }
1751
1752 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1753         assert(u);
1754         assert(c);
1755
1756         if (!exec_context_restrict_namespaces_set(c))
1757                 return 0;
1758
1759         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1760                 return 0;
1761
1762         return seccomp_restrict_namespaces(c->restrict_namespaces);
1763 }
1764
1765 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1766         unsigned long personality;
1767         int r;
1768
1769         assert(u);
1770         assert(c);
1771
1772         if (!c->lock_personality)
1773                 return 0;
1774
1775         if (skip_seccomp_unavailable(u, "LockPersonality="))
1776                 return 0;
1777
1778         personality = c->personality;
1779
1780         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1781         if (personality == PERSONALITY_INVALID) {
1782
1783                 r = opinionated_personality(&personality);
1784                 if (r < 0)
1785                         return r;
1786         }
1787
1788         return seccomp_lock_personality(personality);
1789 }
1790
1791 #endif
1792
1793 #if HAVE_LIBBPF
1794 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1795         assert(u);
1796         assert(c);
1797
1798         if (!exec_context_restrict_filesystems_set(c))
1799                 return 0;
1800
1801         if (!u->manager->restrict_fs) {
1802                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1803                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1804                 return 0;
1805         }
1806
1807         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1808 }
1809 #endif
1810
1811 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1812         assert(u);
1813         assert(c);
1814
1815         if (!c->protect_hostname)
1816                 return 0;
1817
1818         if (ns_type_supported(NAMESPACE_UTS)) {
1819                 if (unshare(CLONE_NEWUTS) < 0) {
1820                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1821                                 *ret_exit_status = EXIT_NAMESPACE;
1822                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1823                         }
1824
1825                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1826                 }
1827         } else
1828                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1829
1830 #if HAVE_SECCOMP
1831         int r;
1832
1833         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1834                 return 0;
1835
1836         r = seccomp_protect_hostname();
1837         if (r < 0) {
1838                 *ret_exit_status = EXIT_SECCOMP;
1839                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1840         }
1841 #endif
1842
1843         return 0;
1844 }
1845
1846 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1847         assert(idle_pipe);
1848
1849         idle_pipe[1] = safe_close(idle_pipe[1]);
1850         idle_pipe[2] = safe_close(idle_pipe[2]);
1851
1852         if (idle_pipe[0] >= 0) {
1853                 int r;
1854
1855                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1856
1857                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1858                         ssize_t n;
1859
1860                         /* Signal systemd that we are bored and want to continue. */
1861                         n = write(idle_pipe[3], "x", 1);
1862                         if (n > 0)
1863                                 /* Wait for systemd to react to the signal above. */
1864                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1865                 }
1866
1867                 idle_pipe[0] = safe_close(idle_pipe[0]);
1868
1869         }
1870
1871         idle_pipe[3] = safe_close(idle_pipe[3]);
1872 }
1873
/* Forward declaration: build_environment() below calls this to look up the environment variable name
 * for an ExecDirectoryType, and treats a NULL result as "no variable for this type". */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1875
1876 static int build_environment(
1877                 const Unit *u,
1878                 const ExecContext *c,
1879                 const ExecParameters *p,
1880                 const CGroupContext *cgroup_context,
1881                 size_t n_fds,
1882                 char **fdnames,
1883                 const char *home,
1884                 const char *username,
1885                 const char *shell,
1886                 dev_t journal_stream_dev,
1887                 ino_t journal_stream_ino,
1888                 const char *memory_pressure_path,
1889                 char ***ret) {
1890
1891         _cleanup_strv_free_ char **our_env = NULL;
1892         size_t n_env = 0;
1893         char *x;
1894         int r;
1895
1896         assert(u);
1897         assert(c);
1898         assert(p);
1899         assert(ret);
1900
1901 #define N_ENV_VARS 19
1902         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1903         if (!our_env)
1904                 return -ENOMEM;
1905
1906         if (n_fds > 0) {
1907                 _cleanup_free_ char *joined = NULL;
1908
1909                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1910                         return -ENOMEM;
1911                 our_env[n_env++] = x;
1912
1913                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1914                         return -ENOMEM;
1915                 our_env[n_env++] = x;
1916
1917                 joined = strv_join(fdnames, ":");
1918                 if (!joined)
1919                         return -ENOMEM;
1920
1921                 x = strjoin("LISTEN_FDNAMES=", joined);
1922                 if (!x)
1923                         return -ENOMEM;
1924                 our_env[n_env++] = x;
1925         }
1926
1927         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1928                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1929                         return -ENOMEM;
1930                 our_env[n_env++] = x;
1931
1932                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1933                         return -ENOMEM;
1934                 our_env[n_env++] = x;
1935         }
1936
1937         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1938          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1939          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1940         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1941                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1942                 if (!x)
1943                         return -ENOMEM;
1944                 our_env[n_env++] = x;
1945         }
1946
1947         if (home) {
1948                 x = strjoin("HOME=", home);
1949                 if (!x)
1950                         return -ENOMEM;
1951
1952                 path_simplify(x + 5);
1953                 our_env[n_env++] = x;
1954         }
1955
1956         if (username) {
1957                 x = strjoin("LOGNAME=", username);
1958                 if (!x)
1959                         return -ENOMEM;
1960                 our_env[n_env++] = x;
1961
1962                 x = strjoin("USER=", username);
1963                 if (!x)
1964                         return -ENOMEM;
1965                 our_env[n_env++] = x;
1966         }
1967
1968         if (shell) {
1969                 x = strjoin("SHELL=", shell);
1970                 if (!x)
1971                         return -ENOMEM;
1972
1973                 path_simplify(x + 6);
1974                 our_env[n_env++] = x;
1975         }
1976
1977         if (!sd_id128_is_null(u->invocation_id)) {
1978                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1979                         return -ENOMEM;
1980
1981                 our_env[n_env++] = x;
1982         }
1983
1984         if (exec_context_needs_term(c)) {
1985                 _cleanup_free_ char *cmdline = NULL;
1986                 const char *tty_path, *term = NULL;
1987
1988                 tty_path = exec_context_tty_path(c);
1989
1990                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1991                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1992                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1993
1994                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1995                         term = getenv("TERM");
1996                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1997                         _cleanup_free_ char *key = NULL;
1998
1999                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2000                         if (!key)
2001                                 return -ENOMEM;
2002
2003                         r = proc_cmdline_get_key(key, 0, &cmdline);
2004                         if (r < 0)
2005                                 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2006                         else if (r > 0)
2007                                 term = cmdline;
2008                 }
2009
2010                 if (!term)
2011                         term = default_term_for_tty(tty_path);
2012
2013                 x = strjoin("TERM=", term);
2014                 if (!x)
2015                         return -ENOMEM;
2016                 our_env[n_env++] = x;
2017         }
2018
2019         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2020                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2021                         return -ENOMEM;
2022
2023                 our_env[n_env++] = x;
2024         }
2025
2026         if (c->log_namespace) {
2027                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2028                 if (!x)
2029                         return -ENOMEM;
2030
2031                 our_env[n_env++] = x;
2032         }
2033
2034         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2035                 _cleanup_free_ char *joined = NULL;
2036                 const char *n;
2037
2038                 if (!p->prefix[t])
2039                         continue;
2040
2041                 if (c->directories[t].n_items == 0)
2042                         continue;
2043
2044                 n = exec_directory_env_name_to_string(t);
2045                 if (!n)
2046                         continue;
2047
2048                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2049                         _cleanup_free_ char *prefixed = NULL;
2050
2051                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2052                         if (!prefixed)
2053                                 return -ENOMEM;
2054
2055                         if (!strextend_with_separator(&joined, ":", prefixed))
2056                                 return -ENOMEM;
2057                 }
2058
2059                 x = strjoin(n, "=", joined);
2060                 if (!x)
2061                         return -ENOMEM;
2062
2063                 our_env[n_env++] = x;
2064         }
2065
2066         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2067                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2068                 if (!x)
2069                         return -ENOMEM;
2070
2071                 our_env[n_env++] = x;
2072         }
2073
2074         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2075                 return -ENOMEM;
2076
2077         our_env[n_env++] = x;
2078
2079         if (memory_pressure_path) {
2080                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2081                 if (!x)
2082                         return -ENOMEM;
2083
2084                 our_env[n_env++] = x;
2085
2086                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2087                         _cleanup_free_ char *b = NULL, *e = NULL;
2088
2089                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2090                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2091                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2092                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2093                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2094                                 return -ENOMEM;
2095
2096                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2097                                 return -ENOMEM;
2098
2099                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2100                         if (!x)
2101                                 return -ENOMEM;
2102
2103                         our_env[n_env++] = x;
2104                 }
2105         }
2106
2107         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2108 #undef N_ENV_VARS
2109
2110         *ret = TAKE_PTR(our_env);
2111
2112         return 0;
2113 }
2114
2115 static int build_pass_environment(const ExecContext *c, char ***ret) {
2116         _cleanup_strv_free_ char **pass_env = NULL;
2117         size_t n_env = 0;
2118
2119         STRV_FOREACH(i, c->pass_environment) {
2120                 _cleanup_free_ char *x = NULL;
2121                 char *v;
2122
2123                 v = getenv(*i);
2124                 if (!v)
2125                         continue;
2126                 x = strjoin(*i, "=", v);
2127                 if (!x)
2128                         return -ENOMEM;
2129
2130                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2131                         return -ENOMEM;
2132
2133                 pass_env[n_env++] = TAKE_PTR(x);
2134                 pass_env[n_env] = NULL;
2135         }
2136
2137         *ret = TAKE_PTR(pass_env);
2138
2139         return 0;
2140 }
2141
2142 bool exec_needs_network_namespace(const ExecContext *context) {
2143         assert(context);
2144
2145         return context->private_network || context->network_namespace_path;
2146 }
2147
2148 static bool exec_needs_ephemeral(const ExecContext *context) {
2149         return (context->root_image || context->root_directory) && context->root_ephemeral;
2150 }
2151
2152 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2153         assert(context);
2154
2155         return context->private_ipc || context->ipc_namespace_path;
2156 }
2157
2158 bool exec_needs_mount_namespace(
2159                 const ExecContext *context,
2160                 const ExecParameters *params,
2161                 const ExecRuntime *runtime) {
2162
2163         assert(context);
2164
2165         if (context->root_image)
2166                 return true;
2167
2168         if (!strv_isempty(context->read_write_paths) ||
2169             !strv_isempty(context->read_only_paths) ||
2170             !strv_isempty(context->inaccessible_paths) ||
2171             !strv_isempty(context->exec_paths) ||
2172             !strv_isempty(context->no_exec_paths))
2173                 return true;
2174
2175         if (context->n_bind_mounts > 0)
2176                 return true;
2177
2178         if (context->n_temporary_filesystems > 0)
2179                 return true;
2180
2181         if (context->n_mount_images > 0)
2182                 return true;
2183
2184         if (context->n_extension_images > 0)
2185                 return true;
2186
2187         if (!strv_isempty(context->extension_directories))
2188                 return true;
2189
2190         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2191                 return true;
2192
2193         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2194                 return true;
2195
2196         if (context->private_devices ||
2197             context->private_mounts > 0 ||
2198             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2199             context->protect_system != PROTECT_SYSTEM_NO ||
2200             context->protect_home != PROTECT_HOME_NO ||
2201             context->protect_kernel_tunables ||
2202             context->protect_kernel_modules ||
2203             context->protect_kernel_logs ||
2204             context->protect_control_groups ||
2205             context->protect_proc != PROTECT_PROC_DEFAULT ||
2206             context->proc_subset != PROC_SUBSET_ALL ||
2207             exec_needs_ipc_namespace(context))
2208                 return true;
2209
2210         if (context->root_directory) {
2211                 if (exec_context_get_effective_mount_apivfs(context))
2212                         return true;
2213
2214                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2215                         if (params && !params->prefix[t])
2216                                 continue;
2217
2218                         if (context->directories[t].n_items > 0)
2219                                 return true;
2220                 }
2221         }
2222
2223         if (context->dynamic_user &&
2224             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2225              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2226              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2227                 return true;
2228
2229         if (context->log_namespace)
2230                 return true;
2231
2232         return false;
2233 }
2234
/* Moves the calling process into a new user namespace and has a forked helper process write its
 * UID/GID maps.
 *
 * ouid/ogid are the original IDs and uid/gid the selected ones (see the comment below). Returns 0 on
 * success and a negative errno-style error on failure; -EIO is returned if the helper reports
 * something that is not a negative error code, or exits with a status other than EXIT_SUCCESS. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value written to (and read back from) the eventfd */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                /* The procfs paths opened below are all built for the parent's PID. */
                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent through the pipe, best effort. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: drop our copy of the write end, so that the read() below returns 0 bytes once the child is
         * gone without having reported an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() hands the PID over to the call below, so the sigkill_waitp cleanup handler no longer sees it. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2393
2394 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2395         assert(context);
2396
2397         if (!context->dynamic_user)
2398                 return false;
2399
2400         if (type == EXEC_DIRECTORY_CONFIGURATION)
2401                 return false;
2402
2403         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2404                 return false;
2405
2406         return true;
2407 }
2408
2409 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2410         _cleanup_free_ char *src_abs = NULL;
2411         int r;
2412
2413         assert(source);
2414
2415         src_abs = path_join(root, source);
2416         if (!src_abs)
2417                 return -ENOMEM;
2418
2419         STRV_FOREACH(dst, symlinks) {
2420                 _cleanup_free_ char *dst_abs = NULL;
2421
2422                 dst_abs = path_join(root, *dst);
2423                 if (!dst_abs)
2424                         return -ENOMEM;
2425
2426                 r = mkdir_parents_label(dst_abs, 0755);
2427                 if (r < 0)
2428                         return r;
2429
2430                 r = symlink_idempotent(src_abs, dst_abs, true);
2431                 if (r < 0)
2432                         return r;
2433         }
2434
2435         return 0;
2436 }
2437
2438 static int setup_exec_directory(
2439                 Unit *u,
2440                 const ExecContext *context,
2441                 const ExecParameters *params,
2442                 uid_t uid,
2443                 gid_t gid,
2444                 ExecDirectoryType type,
2445                 bool needs_mount_namespace,
2446                 int *exit_status) {
2447
2448         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2449                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2450                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2451                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2452                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2453                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2454         };
2455         int r;
2456
2457         assert(context);
2458         assert(params);
2459         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2460         assert(exit_status);
2461
2462         if (!params->prefix[type])
2463                 return 0;
2464
2465         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2466                 if (!uid_is_valid(uid))
2467                         uid = 0;
2468                 if (!gid_is_valid(gid))
2469                         gid = 0;
2470         }
2471
2472         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2473                 _cleanup_free_ char *p = NULL, *pp = NULL;
2474
2475                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2476                 if (!p) {
2477                         r = -ENOMEM;
2478                         goto fail;
2479                 }
2480
2481                 r = mkdir_parents_label(p, 0755);
2482                 if (r < 0)
2483                         goto fail;
2484
2485                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2486
2487                         /* If we are in user mode, and a configuration directory exists but a state directory
2488                          * doesn't exist, then we likely are upgrading from an older systemd version that
2489                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2490                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2491                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2492                          * separated. If a service has both dirs configured but only the configuration dir
2493                          * exists and the state dir does not, we assume we are looking at an update
2494                          * situation. Hence, create a compatibility symlink, so that all expectations are
2495                          * met.
2496                          *
2497                          * (We also do something similar with the log directory, which still doesn't exist in
2498                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2499
2500                         /* this assumes the state dir is always created before the configuration dir */
2501                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2502                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2503
2504                         r = laccess(p, F_OK);
2505                         if (r == -ENOENT) {
2506                                 _cleanup_free_ char *q = NULL;
2507
2508                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2509                                  * under the configuration hierarchy. */
2510
2511                                 if (type == EXEC_DIRECTORY_STATE)
2512                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2513                                 else if (type == EXEC_DIRECTORY_LOGS)
2514                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2515                                 else
2516                                         assert_not_reached();
2517                                 if (!q) {
2518                                         r = -ENOMEM;
2519                                         goto fail;
2520                                 }
2521
2522                                 r = laccess(q, F_OK);
2523                                 if (r >= 0) {
2524                                         /* It does exist! This hence looks like an update. Symlink the
2525                                          * configuration directory into the state directory. */
2526
2527                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2528                                         if (r < 0)
2529                                                 goto fail;
2530
2531                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2532                                         continue;
2533                                 } else if (r != -ENOENT)
2534                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2535
2536                         } else if (r < 0)
2537                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2538                 }
2539
2540                 if (exec_directory_is_private(context, type)) {
2541                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2542                          * case we want to avoid leaving a directory around fully accessible that is owned by
2543                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2544                          * trick used by container managers to prohibit host users to get access to files of
2545                          * the same UID in containers: we place everything inside a directory that has an
2546                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2547                          * for unprivileged host code. We then use fs namespacing to make this directory
2548                          * permeable for the service itself.
2549                          *
2550                          * Specifically: for a service which wants a special directory "foo/" we first create
2551                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2552                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2553                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2554                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2555                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2556                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2557                          * for the service and making sure it only gets access to the dirs it needs but no
2558                          * others. Tricky? Yes, absolutely, but it works!
2559                          *
2560                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2561                          * to be owned by the service itself.
2562                          *
2563                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2564                          * for sharing files or sockets with other services. */
2565
2566                         pp = path_join(params->prefix[type], "private");
2567                         if (!pp) {
2568                                 r = -ENOMEM;
2569                                 goto fail;
2570                         }
2571
2572                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2573                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2574                         if (r < 0)
2575                                 goto fail;
2576
2577                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2578                                 r = -ENOMEM;
2579                                 goto fail;
2580                         }
2581
2582                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2583                         r = mkdir_parents_label(pp, 0755);
2584                         if (r < 0)
2585                                 goto fail;
2586
2587                         if (is_dir(p, false) > 0 &&
2588                             (laccess(pp, F_OK) == -ENOENT)) {
2589
2590                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2591                                  * it over. Most likely the service has been upgraded from one that didn't use
2592                                  * DynamicUser=1, to one that does. */
2593
2594                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2595                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2596                                               exec_directory_type_to_string(type), p, pp);
2597
2598                                 r = RET_NERRNO(rename(p, pp));
2599                                 if (r < 0)
2600                                         goto fail;
2601                         } else {
2602                                 /* Otherwise, create the actual directory for the service */
2603
2604                                 r = mkdir_label(pp, context->directories[type].mode);
2605                                 if (r < 0 && r != -EEXIST)
2606                                         goto fail;
2607                         }
2608
2609                         if (!context->directories[type].items[i].only_create) {
2610                                 /* And link it up from the original place.
2611                                  * Notes
2612                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2613                                  *    the host, and a new one for the child namespace will be created later.
2614                                  * 2) It is not necessary to create this symlink when one of its parent
2615                                  *    directories is specified and already created. E.g.
2616                                  *        StateDirectory=foo foo/bar
2617                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2618                                  *        pp = "/var/lib/private/foo/bar"
2619                                  *        p = "/var/lib/foo/bar"
2620                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2621                                  *    we do not need to create the symlink, but we cannot create the symlink.
2622                                  *    See issue #24783. */
2623                                 r = symlink_idempotent(pp, p, true);
2624                                 if (r < 0)
2625                                         goto fail;
2626                         }
2627
2628                 } else {
2629                         _cleanup_free_ char *target = NULL;
2630
2631                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2632                             readlink_and_make_absolute(p, &target) >= 0) {
2633                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2634
2635                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2636                                  * by DynamicUser=1 (see above)?
2637                                  *
2638                                  * We do this for all directory types except for ConfigurationDirectory=,
2639                                  * since they all support the private/ symlink logic at least in some
2640                                  * configurations, see above. */
2641
2642                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2643                                 if (r < 0)
2644                                         goto fail;
2645
2646                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2647                                 if (!q) {
2648                                         r = -ENOMEM;
2649                                         goto fail;
2650                                 }
2651
2652                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2653                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2654                                 if (r < 0)
2655                                         goto fail;
2656
2657                                 if (path_equal(q_resolved, target_resolved)) {
2658
2659                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2660                                          * but is no longer. Let's move the directory back up. */
2661
2662                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2663                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2664                                                       exec_directory_type_to_string(type), q, p);
2665
2666                                         r = RET_NERRNO(unlink(p));
2667                                         if (r < 0)
2668                                                 goto fail;
2669
2670                                         r = RET_NERRNO(rename(q, p));
2671                                         if (r < 0)
2672                                                 goto fail;
2673                                 }
2674                         }
2675
2676                         r = mkdir_label(p, context->directories[type].mode);
2677                         if (r < 0) {
2678                                 if (r != -EEXIST)
2679                                         goto fail;
2680
2681                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2682                                         struct stat st;
2683
2684                                         /* Don't change the owner/access mode of the configuration directory,
2685                                          * as in the common case it is not written to by a service, and shall
2686                                          * not be writable. */
2687
2688                                         r = RET_NERRNO(stat(p, &st));
2689                                         if (r < 0)
2690                                                 goto fail;
2691
2692                                         /* Still complain if the access mode doesn't match */
2693                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2694                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2695                                                                  "(File system: %o %sMode: %o)",
2696                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2697                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2698
2699                                         continue;
2700                                 }
2701                         }
2702                 }
2703
2704                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2705                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2706                  * current UID/GID ownership.) */
2707                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2708                 if (r < 0)
2709                         goto fail;
2710
2711                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2712                  * available to user code anyway */
2713                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2714                         continue;
2715
2716                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2717                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2718                  * assignments to exist. */
2719                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2720                 if (r < 0)
2721                         goto fail;
2722         }
2723
2724         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2725          * they are set up later, to allow configuring empty var/run/etc. */
2726         if (!needs_mount_namespace)
2727                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2728                         r = create_many_symlinks(params->prefix[type],
2729                                                  context->directories[type].items[i].path,
2730                                                  context->directories[type].items[i].symlinks);
2731                         if (r < 0)
2732                                 goto fail;
2733                 }
2734
2735         return 0;
2736
2737 fail:
2738         *exit_status = exit_status_table[type];
2739         return r;
2740 }
2741
2742 static int write_credential(
2743                 int dfd,
2744                 const char *id,
2745                 const void *data,
2746                 size_t size,
2747                 uid_t uid,
2748                 bool ownership_ok) {
2749
2750         _cleanup_(unlink_and_freep) char *tmp = NULL;
2751         _cleanup_close_ int fd = -EBADF;
2752         int r;
2753
2754         r = tempfn_random_child("", "cred", &tmp);
2755         if (r < 0)
2756                 return r;
2757
2758         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2759         if (fd < 0) {
2760                 tmp = mfree(tmp);
2761                 return -errno;
2762         }
2763
2764         r = loop_write(fd, data, size, /* do_poll = */ false);
2765         if (r < 0)
2766                 return r;
2767
2768         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2769                 return -errno;
2770
2771         if (uid_is_valid(uid) && uid != getuid()) {
2772                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2773                 if (r < 0) {
2774                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2775                                 return r;
2776
2777                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2778                                             * to express: that the user gets read access and nothing
2779                                             * else. But if the backing fs can't support that (e.g. ramfs)
2780                                             * then we can use file ownership instead. But that's only safe if
2781                                             * we can then re-mount the whole thing read-only, so that the
2782                                             * user can no longer chmod() the file to gain write access. */
2783                                 return r;
2784
2785                         if (fchown(fd, uid, GID_INVALID) < 0)
2786                                 return -errno;
2787                 }
2788         }
2789
2790         if (renameat(dfd, tmp, dfd, id) < 0)
2791                 return -errno;
2792
2793         tmp = mfree(tmp);
2794         return 0;
2795 }
2796
2797 typedef enum CredentialSearchPath { /* Selects which directories credential_search_path() assembles */
2798         CREDENTIAL_SEARCH_PATH_TRUSTED, /* received credentials directory (if set) + "credstore" config paths */
2799         CREDENTIAL_SEARCH_PATH_ENCRYPTED, /* received encrypted credentials directory + "credstore.encrypted" config paths */
2800         CREDENTIAL_SEARCH_PATH_ALL, /* both of the above, encrypted locations first */
2801         _CREDENTIAL_SEARCH_PATH_MAX, /* number of valid values; upper bound of the range assert in credential_search_path() */
2802         _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
2803 } CredentialSearchPath;
2804
2805 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2806
2807         _cleanup_strv_free_ char **l = NULL;
2808
2809         assert(params);
2810         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2811
2812         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2813          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2814          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2815
2816         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2817                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2818                         return NULL;
2819
2820                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2821                         return NULL;
2822         }
2823
2824         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2825                 if (params->received_credentials_directory)
2826                         if (strv_extend(&l, params->received_credentials_directory) < 0)
2827                                 return NULL;
2828
2829                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2830                         return NULL;
2831         }
2832
2833         if (DEBUG_LOGGING) {
2834                 _cleanup_free_ char *t = strv_join(l, ":");
2835
2836                 log_debug("Credential search path is: %s", strempty(t));
2837         }
2838
2839         return TAKE_PTR(l);
2840 }
2841
2842 static int maybe_decrypt_and_write_credential(
2843                 int dir_fd,
2844                 const char *id,
2845                 bool encrypted,
2846                 uid_t uid,
2847                 bool ownership_ok,
2848                 const char *data,
2849                 size_t size,
2850                 uint64_t *left) {
2851
2852         _cleanup_free_ void *plaintext = NULL;
2853         size_t add;
2854         int r;
2855
2856         if (encrypted) {
2857                 size_t plaintext_size = 0;
2858
2859                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2860                                                 &plaintext, &plaintext_size);
2861                 if (r < 0)
2862                         return r;
2863
2864                 data = plaintext;
2865                 size = plaintext_size;
2866         }
2867
2868         add = strlen(id) + size;
2869         if (add > *left)
2870                 return -E2BIG;
2871
2872         r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2873         if (r < 0)
2874                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2875
2876         *left -= add;
2877         return 0;
2878 }
2879
2880 static int load_credential_glob(
2881                 const char *path,
2882                 bool encrypted,
2883                 char **search_path,
2884                 ReadFullFileFlags flags,
2885                 int write_dfd,
2886                 uid_t uid,
2887                 bool ownership_ok,
2888                 uint64_t *left) {
2889
2890         int r;
2891
2892         STRV_FOREACH(d, search_path) {
2893                 _cleanup_globfree_ glob_t pglob = {};
2894                 _cleanup_free_ char *j = NULL;
2895
2896                 j = path_join(*d, path);
2897                 if (!j)
2898                         return -ENOMEM;
2899
2900                 r = safe_glob(j, 0, &pglob);
2901                 if (r == -ENOENT)
2902                         continue;
2903                 if (r < 0)
2904                         return r;
2905
2906                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2907                         _cleanup_free_ char *fn = NULL;
2908                         _cleanup_(erase_and_freep) char *data = NULL;
2909                         size_t size;
2910
2911                         /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2912                         r = read_full_file_full(
2913                                 AT_FDCWD,
2914                                 pglob.gl_pathv[n],
2915                                 UINT64_MAX,
2916                                 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2917                                 flags,
2918                                 NULL,
2919                                 &data, &size);
2920                         if (r < 0)
2921                                 return log_debug_errno(r, "Failed to read credential '%s': %m",
2922                                                         pglob.gl_pathv[n]);
2923
2924                         r = path_extract_filename(pglob.gl_pathv[n], &fn);
2925                         if (r < 0)
2926                                 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2927                                                         pglob.gl_pathv[n]);
2928
2929                         r = maybe_decrypt_and_write_credential(
2930                                 write_dfd,
2931                                 fn,
2932                                 encrypted,
2933                                 uid,
2934                                 ownership_ok,
2935                                 data, size,
2936                                 left);
2937                         if (r == -EEXIST)
2938                                 continue;
2939                         if (r < 0)
2940                                 return r;
2941                 }
2942         }
2943
2944         return 0;
2945 }
2946
2947 static int load_credential(
2948                 const ExecContext *context,
2949                 const ExecParameters *params,
2950                 const char *id,
2951                 const char *path,
2952                 bool encrypted,
2953                 const char *unit,
2954                 int read_dfd,
2955                 int write_dfd,
2956                 uid_t uid,
2957                 bool ownership_ok,
2958                 uint64_t *left) {
2959
2960         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2961         _cleanup_strv_free_ char **search_path = NULL;
2962         _cleanup_(erase_and_freep) char *data = NULL;
2963         _cleanup_free_ char *bindname = NULL;
2964         const char *source = NULL;
2965         bool missing_ok = true;
2966         size_t size, maxsz;
2967         int r;
2968
2969         assert(context);
2970         assert(params);
2971         assert(id);
2972         assert(path);
2973         assert(unit);
2974         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2975         assert(write_dfd >= 0);
2976         assert(left);
2977
2978         if (read_dfd >= 0) {
2979                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2980                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2981                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2982                  * open it. */
2983
2984                 if (!filename_is_valid(path)) /* safety check */
2985                         return -EINVAL;
2986
2987                 missing_ok = true;
2988                 source = path;
2989
2990         } else if (path_is_absolute(path)) {
2991                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2992                  * sockets */
2993
2994                 if (!path_is_valid(path)) /* safety check */
2995                         return -EINVAL;
2996
2997                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2998
2999                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3000                  * via the source socket address in case we read off an AF_UNIX socket. */
3001                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3002                         return -ENOMEM;
3003
3004                 missing_ok = false;
3005                 source = path;
3006
3007         } else if (credential_name_valid(path)) {
3008                 /* If this is a relative path, take it as credential name relative to the credentials
3009                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3010                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3011
3012                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3013                 if (!search_path)
3014                         return -ENOMEM;
3015
3016                 missing_ok = true;
3017         } else
3018                 source = NULL;
3019
3020         if (encrypted)
3021                 flags |= READ_FULL_FILE_UNBASE64;
3022
3023         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3024
3025         if (search_path) {
3026                 STRV_FOREACH(d, search_path) {
3027                         _cleanup_free_ char *j = NULL;
3028
3029                         j = path_join(*d, path);
3030                         if (!j)
3031                                 return -ENOMEM;
3032
3033                         r = read_full_file_full(
3034                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3035                                         UINT64_MAX,
3036                                         maxsz,
3037                                         flags,
3038                                         NULL,
3039                                         &data, &size);
3040                         if (r != -ENOENT)
3041                                 break;
3042                 }
3043         } else if (source)
3044                 r = read_full_file_full(
3045                                 read_dfd, source,
3046                                 UINT64_MAX,
3047                                 maxsz,
3048                                 flags,
3049                                 bindname,
3050                                 &data, &size);
3051         else
3052                 r = -ENOENT;
3053
3054         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3055                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3056                  * will get clear errors if we don't pass such a missing credential on as they
3057                  * themselves will get ENOENT when trying to read them, which should not be much
3058                  * worse than when we handle the error here and make it fatal.
3059                  *
3060                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3061                  * we are fine, too. */
3062                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3063                 return 0;
3064         }
3065         if (r < 0)
3066                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3067
3068         return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3069 }
3070
3071 struct load_cred_args { /* Arguments handed as userdata to load_cred_recurse_dir_cb() */
3072         const ExecContext *context; /* forwarded to load_credential() */
3073         const ExecParameters *params; /* forwarded to load_credential() */
3074         bool encrypted; /* forwarded to load_credential() as its 'encrypted' argument */
3075         const char *unit; /* forwarded to load_credential() as its 'unit' argument */
3076         int dfd; /* directory fd checked for an already existing credential and passed on as 'write_dfd' */
3077         uid_t uid; /* forwarded to load_credential() */
3078         bool ownership_ok; /* forwarded to load_credential() */
3079         uint64_t *left; /* forwarded to load_credential(); points to the remaining size budget */
3080 };
3081
3082 static int load_cred_recurse_dir_cb(
3083                 RecurseDirEvent event,
3084                 const char *path,
3085                 int dir_fd,
3086                 int inode_fd,
3087                 const struct dirent *de,
3088                 const struct statx *sx,
3089                 void *userdata) {
3090
3091         struct load_cred_args *args = ASSERT_PTR(userdata);
3092         _cleanup_free_ char *sub_id = NULL;
3093         int r;
3094
3095         if (event != RECURSE_DIR_ENTRY)
3096                 return RECURSE_DIR_CONTINUE;
3097
3098         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3099                 return RECURSE_DIR_CONTINUE;
3100
3101         sub_id = strreplace(path, "/", "_");
3102         if (!sub_id)
3103                 return -ENOMEM;
3104
3105         if (!credential_name_valid(sub_id))
3106                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3107
3108         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3109                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3110                 return RECURSE_DIR_CONTINUE;
3111         }
3112         if (errno != ENOENT)
3113                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3114
3115         r = load_credential(
3116                         args->context,
3117                         args->params,
3118                         sub_id,
3119                         de->d_name,
3120                         args->encrypted,
3121                         args->unit,
3122                         dir_fd,
3123                         args->dfd,
3124                         args->uid,
3125                         args->ownership_ok,
3126                         args->left);
3127         if (r < 0)
3128                 return r;
3129
3130         return RECURSE_DIR_CONTINUE;
3131 }
3132
3133 static int acquire_credentials(
3134                 const ExecContext *context,
3135                 const ExecParameters *params,
3136                 const char *unit,
3137                 const char *p,
3138                 uid_t uid,
3139                 bool ownership_ok) {
3140
3141         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3142         _cleanup_close_ int dfd = -EBADF;
3143         const char *ic;
3144         ExecLoadCredential *lc;
3145         ExecSetCredential *sc;
3146         int r;
3147
3148         assert(context);
3149         assert(p);
3150
3151         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3152         if (dfd < 0)
3153                 return -errno;
3154
3155         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3156         if (r < 0)
3157                 return r;
3158
3159         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3160         HASHMAP_FOREACH(lc, context->load_credentials) {
3161                 _cleanup_close_ int sub_fd = -EBADF;
3162
3163                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3164                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3165                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
3166                  * propagate a credential passed to us from further up. */
3167
3168                 if (path_is_absolute(lc->path)) {
3169                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3170                         if (sub_fd < 0 && !IN_SET(errno,
3171                                                   ENOTDIR,  /* Not a directory */
3172                                                   ENOENT))  /* Doesn't exist? */
3173                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3174                 }
3175
3176                 if (sub_fd < 0)
3177                         /* Regular file (incl. a credential passed in from higher up) */
3178                         r = load_credential(
3179                                         context,
3180                                         params,
3181                                         lc->id,
3182                                         lc->path,
3183                                         lc->encrypted,
3184                                         unit,
3185                                         AT_FDCWD,
3186                                         dfd,
3187                                         uid,
3188                                         ownership_ok,
3189                                         &left);
3190                 else
3191                         /* Directory */
3192                         r = recurse_dir(
3193                                         sub_fd,
3194                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3195                                         /* statx_mask= */ 0,
3196                                         /* n_depth_max= */ UINT_MAX,
3197                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3198                                         load_cred_recurse_dir_cb,
3199                                         &(struct load_cred_args) {
3200                                                 .context = context,
3201                                                 .params = params,
3202                                                 .encrypted = lc->encrypted,
3203                                                 .unit = unit,
3204                                                 .dfd = dfd,
3205                                                 .uid = uid,
3206                                                 .ownership_ok = ownership_ok,
3207                                                 .left = &left,
3208                                         });
3209                 if (r < 0)
3210                         return r;
3211         }
3212
3213         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3214          * override any credentials found earlier. */
3215         SET_FOREACH(ic, context->import_credentials) {
3216                 _cleanup_free_ char **search_path = NULL;
3217
3218                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3219                 if (!search_path)
3220                         return -ENOMEM;
3221
3222                 r = load_credential_glob(
3223                                 ic,
3224                                 /* encrypted = */ false,
3225                                 search_path,
3226                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3227                                 dfd,
3228                                 uid,
3229                                 ownership_ok,
3230                                 &left);
3231                 if (r < 0)
3232                         return r;
3233
3234                 search_path = strv_free(search_path);
3235                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3236                 if (!search_path)
3237                         return -ENOMEM;
3238
3239                 r = load_credential_glob(
3240                                 ic,
3241                                 /* encrypted = */ true,
3242                                 search_path,
3243                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3244                                 dfd,
3245                                 uid,
3246                                 ownership_ok,
3247                                 &left);
3248                 if (r < 0)
3249                         return r;
3250         }
3251
3252         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3253          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3254         HASHMAP_FOREACH(sc, context->set_credentials) {
3255                 _cleanup_(erase_and_freep) void *plaintext = NULL;
3256                 const char *data;
3257                 size_t size, add;
3258
3259                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3260                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3261                  * slow and involved, hence it's nice to be able to skip that if the credential already
3262                  * exists anyway. */
3263                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3264                         continue;
3265                 if (errno != ENOENT)
3266                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3267
3268                 if (sc->encrypted) {
3269                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3270                         if (r < 0)
3271                                 return r;
3272
3273                         data = plaintext;
3274                 } else {
3275                         data = sc->data;
3276                         size = sc->size;
3277                 }
3278
3279                 add = strlen(sc->id) + size;
3280                 if (add > left)
3281                         return -E2BIG;
3282
3283                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3284                 if (r < 0)
3285                         return r;
3286
3287                 left -= add;
3288         }
3289
3290         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3291         if (r < 0)
3292                 return r;
3293
3294         /* After we created all keys with the right perms, also make sure the credential store as a whole is
3295          * accessible */
3296
3297         if (uid_is_valid(uid) && uid != getuid()) {
3298                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3299                 if (r < 0) {
3300                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3301                                 return r;
3302
3303                         if (!ownership_ok)
3304                                 return r;
3305
3306                         if (fchown(dfd, uid, GID_INVALID) < 0)
3307                                 return -errno;
3308                 }
3309         }
3310
3311         return 0;
3312 }
3313
3314 static int setup_credentials_internal(
3315                 const ExecContext *context,
3316                 const ExecParameters *params,
3317                 const char *unit,
3318                 const char *final,        /* This is where the credential store shall eventually end up at */
3319                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
3320                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
3321                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3322                 uid_t uid) {
3323
3324         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3325                                    * if we mounted something; false if we definitely can't mount anything */
3326         bool final_mounted;
3327         const char *where;
3328
3329         assert(context);
3330         assert(final);
3331         assert(workspace);
3332
3333         if (reuse_workspace) {
3334                 r = path_is_mount_point(workspace, NULL, 0);
3335                 if (r < 0)
3336                         return r;
3337                 if (r > 0)
3338                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3339                 else
3340                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3341         } else
3342                 workspace_mounted = -1; /* ditto */
3343
3344         r = path_is_mount_point(final, NULL, 0);
3345         if (r < 0)
3346                 return r;
3347         if (r > 0) {
3348                 /* If the final place already has something mounted, we use that. If the workspace also has
3349                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3350                  * different). */
3351                 final_mounted = true;
3352
3353                 if (workspace_mounted < 0) {
3354                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
3355                          * the final version to the workspace, and make it writable, so that we can make
3356                          * changes */
3357
3358                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3359                         if (r < 0)
3360                                 return r;
3361
3362                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3363                         if (r < 0)
3364                                 return r;
3365
3366                         workspace_mounted = true;
3367                 }
3368         } else
3369                 final_mounted = false;
3370
3371         if (workspace_mounted < 0) {
3372                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3373
3374                 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3375                 if (r < 0) {
3376                         /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3377                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3378                         if (r < 0) {
3379                                 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3380                                         return r;
3381
3382                                 if (must_mount) /* If we it's not OK to use the plain directory
3383                                                  * fallback, propagate all errors too */
3384                                         return r;
3385
3386                                 /* If we lack privileges to bind mount stuff, then let's gracefully
3387                                  * proceed for compat with container envs, and just use the final dir
3388                                  * as is. */
3389
3390                                 workspace_mounted = false;
3391                         } else {
3392                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3393                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3394                                 if (r < 0)
3395                                         return r;
3396
3397                                 workspace_mounted = true;
3398                         }
3399                 } else
3400                         workspace_mounted = true;
3401         }
3402
3403         assert(!must_mount || workspace_mounted > 0);
3404         where = workspace_mounted ? workspace : final;
3405
3406         (void) label_fix_full(AT_FDCWD, where, final, 0);
3407
3408         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3409         if (r < 0)
3410                 return r;
3411
3412         if (workspace_mounted) {
3413                 bool install;
3414
3415                 /* Determine if we should actually install the prepared mount in the final location by bind
3416                  * mounting it there. We do so only if the mount is not established there already, and if the
3417                  * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3418                  * case we are doing all this in a mount namespace, thus no one else will see that we
3419                  * allocated a file system we are getting rid of again here. */
3420                 if (final_mounted)
3421                         install = false; /* already installed */
3422                 else {
3423                         r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3424                         if (r < 0)
3425                                 return r;
3426
3427                         install = r == 0; /* install only if non-empty */
3428                 }
3429
3430                 if (install) {
3431                         /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3432                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3433                         if (r < 0)
3434                                 return r;
3435
3436                         /* And mount it to the final place, read-only */
3437                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3438                 } else
3439                         /* Otherwise get rid of it */
3440                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3441                 if (r < 0)
3442                         return r;
3443         } else {
3444                 _cleanup_free_ char *parent = NULL;
3445
3446                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3447                  * open access to the top-level credential directory and the per-service directory now */
3448
3449                 r = path_extract_directory(final, &parent);
3450                 if (r < 0)
3451                         return r;
3452                 if (chmod(parent, 0755) < 0)
3453                         return -errno;
3454         }
3455
3456         return 0;
3457 }
3458
3459 static int setup_credentials(
3460                 const ExecContext *context,
3461                 const ExecParameters *params,
3462                 const char *unit,
3463                 uid_t uid) {
3464
3465         _cleanup_free_ char *p = NULL, *q = NULL;
3466         int r;
3467
3468         assert(context);
3469         assert(params);
3470
3471         if (!exec_context_has_credentials(context))
3472                 return 0;
3473
3474         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3475                 return -EINVAL;
3476
3477         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3478          * and the subdir we mount over with a read-only file system readable by the service's user */
3479         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3480         if (!q)
3481                 return -ENOMEM;
3482
3483         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3484         if (r < 0 && r != -EEXIST)
3485                 return r;
3486
3487         p = path_join(q, unit);
3488         if (!p)
3489                 return -ENOMEM;
3490
3491         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3492         if (r < 0 && r != -EEXIST)
3493                 return r;
3494
3495         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3496         if (r < 0) {
3497                 _cleanup_free_ char *t = NULL, *u = NULL;
3498
3499                 /* If this is not a privilege or support issue then propagate the error */
3500                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3501                         return r;
3502
3503                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3504                  * it into place, so that users can't access half-initialized credential stores. */
3505                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3506                 if (!t)
3507                         return -ENOMEM;
3508
3509                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3510                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3511                  * after it is fully set up */
3512                 u = path_join(t, unit);
3513                 if (!u)
3514                         return -ENOMEM;
3515
3516                 FOREACH_STRING(i, t, u) {
3517                         r = mkdir_label(i, 0700);
3518                         if (r < 0 && r != -EEXIST)
3519                                 return r;
3520                 }
3521
3522                 r = setup_credentials_internal(
3523                                 context,
3524                                 params,
3525                                 unit,
3526                                 p,       /* final mount point */
3527                                 u,       /* temporary workspace to overmount */
3528                                 true,    /* reuse the workspace if it is already a mount */
3529                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3530                                 uid);
3531
3532                 (void) rmdir(u); /* remove the workspace again if we can. */
3533
3534                 if (r < 0)
3535                         return r;
3536
3537         } else if (r == 0) {
3538
3539                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3540                  * we can use the same directory for all cases, after turning off propagation. Question
3541                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3542                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3543                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3544                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3545                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3546                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3547                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3548                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3549                  * propagation on the former, and then overmount the latter.
3550                  *
3551                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3552                  * for this purpose, but there are few other candidates that work equally well for us, and
3553                  * given that the we do this in a privately namespaced short-lived single-threaded process
3554                  * that no one else sees this should be OK to do. */
3555
3556                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3557                 if (r < 0)
3558                         goto child_fail;
3559
3560                 r = setup_credentials_internal(
3561                                 context,
3562                                 params,
3563                                 unit,
3564                                 p,           /* final mount point */
3565                                 "/dev/shm",  /* temporary workspace to overmount */
3566                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3567                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3568                                 uid);
3569                 if (r < 0)
3570                         goto child_fail;
3571
3572                 _exit(EXIT_SUCCESS);
3573
3574         child_fail:
3575                 _exit(EXIT_FAILURE);
3576         }
3577
3578         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3579          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3580          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3581          * seen by users when trying access this inode. */
3582         (void) rmdir(p);
3583         return 0;
3584 }
3585
3586 #if ENABLE_SMACK
3587 static int setup_smack(
3588                 const Manager *manager,
3589                 const ExecContext *context,
3590                 int executable_fd) {
3591         int r;
3592
3593         assert(context);
3594         assert(executable_fd >= 0);
3595
3596         if (context->smack_process_label) {
3597                 r = mac_smack_apply_pid(0, context->smack_process_label);
3598                 if (r < 0)
3599                         return r;
3600         } else if (manager->default_smack_process_label) {
3601                 _cleanup_free_ char *exec_label = NULL;
3602
3603                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3604                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3605                         return r;
3606
3607                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3608                 if (r < 0)
3609                         return r;
3610         }
3611
3612         return 0;
3613 }
3614 #endif
3615
3616 static int compile_bind_mounts(
3617                 const ExecContext *context,
3618                 const ExecParameters *params,
3619                 BindMount **ret_bind_mounts,
3620                 size_t *ret_n_bind_mounts,
3621                 char ***ret_empty_directories) {
3622
3623         _cleanup_strv_free_ char **empty_directories = NULL;
3624         BindMount *bind_mounts = NULL;
3625         size_t n, h = 0;
3626         int r;
3627
3628         assert(context);
3629         assert(params);
3630         assert(ret_bind_mounts);
3631         assert(ret_n_bind_mounts);
3632         assert(ret_empty_directories);
3633
3634         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3635
3636         n = context->n_bind_mounts;
3637         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3638                 if (!params->prefix[t])
3639                         continue;
3640
3641                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3642                         n += !context->directories[t].items[i].only_create;
3643         }
3644
3645         if (n <= 0) {
3646                 *ret_bind_mounts = NULL;
3647                 *ret_n_bind_mounts = 0;
3648                 *ret_empty_directories = NULL;
3649                 return 0;
3650         }
3651
3652         bind_mounts = new(BindMount, n);
3653         if (!bind_mounts)
3654                 return -ENOMEM;
3655
3656         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3657                 BindMount *item = context->bind_mounts + i;
3658                 _cleanup_free_ char *s = NULL, *d = NULL;
3659
3660                 s = strdup(item->source);
3661                 if (!s)
3662                         return -ENOMEM;
3663
3664                 d = strdup(item->destination);
3665                 if (!d)
3666                         return -ENOMEM;
3667
3668                 bind_mounts[h++] = (BindMount) {
3669                         .source = TAKE_PTR(s),
3670                         .destination = TAKE_PTR(d),
3671                         .read_only = item->read_only,
3672                         .recursive = item->recursive,
3673                         .ignore_enoent = item->ignore_enoent,
3674                 };
3675         }
3676
3677         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3678                 if (!params->prefix[t])
3679                         continue;
3680
3681                 if (context->directories[t].n_items == 0)
3682                         continue;
3683
3684                 if (exec_directory_is_private(context, t) &&
3685                     !exec_context_with_rootfs(context)) {
3686                         char *private_root;
3687
3688                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3689                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3690                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3691
3692                         private_root = path_join(params->prefix[t], "private");
3693                         if (!private_root)
3694                                 return -ENOMEM;
3695
3696                         r = strv_consume(&empty_directories, private_root);
3697                         if (r < 0)
3698                                 return r;
3699                 }
3700
3701                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3702                         _cleanup_free_ char *s = NULL, *d = NULL;
3703
3704                         /* When one of the parent directories is in the list, we cannot create the symlink
3705                          * for the child directory. See also the comments in setup_exec_directory(). */
3706                         if (context->directories[t].items[i].only_create)
3707                                 continue;
3708
3709                         if (exec_directory_is_private(context, t))
3710                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3711                         else
3712                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3713                         if (!s)
3714                                 return -ENOMEM;
3715
3716                         if (exec_directory_is_private(context, t) &&
3717                             exec_context_with_rootfs(context))
3718                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3719                                  * directory is not created on the root directory. So, let's bind-mount the directory
3720                                  * on the 'non-private' place. */
3721                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3722                         else
3723                                 d = strdup(s);
3724                         if (!d)
3725                                 return -ENOMEM;
3726
3727                         bind_mounts[h++] = (BindMount) {
3728                                 .source = TAKE_PTR(s),
3729                                 .destination = TAKE_PTR(d),
3730                                 .read_only = false,
3731                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3732                                 .recursive = true,
3733                                 .ignore_enoent = false,
3734                         };
3735                 }
3736         }
3737
3738         assert(h == n);
3739
3740         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3741         *ret_n_bind_mounts = n;
3742         *ret_empty_directories = TAKE_PTR(empty_directories);
3743
3744         return (int) n;
3745 }
3746
3747 /* ret_symlinks will contain a list of pairs src:dest that describes
3748  * the symlinks to create later on. For example, the symlinks needed
3749  * to safely give private directories to DynamicUser=1 users. */
3750 static int compile_symlinks(
3751                 const ExecContext *context,
3752                 const ExecParameters *params,
3753                 char ***ret_symlinks) {
3754
3755         _cleanup_strv_free_ char **symlinks = NULL;
3756         int r;
3757
3758         assert(context);
3759         assert(params);
3760         assert(ret_symlinks);
3761
3762         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3763                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3764                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3765
3766                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3767                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3768
3769                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3770                                 dst_abs = path_join(params->prefix[dt], *symlink);
3771                                 if (!src_abs || !dst_abs)
3772                                         return -ENOMEM;
3773
3774                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3775                                 if (r < 0)
3776                                         return r;
3777                         }
3778
3779                         if (!exec_directory_is_private(context, dt) ||
3780                             exec_context_with_rootfs(context) ||
3781                             context->directories[dt].items[i].only_create)
3782                                 continue;
3783
3784                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3785                         if (!private_path)
3786                                 return -ENOMEM;
3787
3788                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3789                         if (!path)
3790                                 return -ENOMEM;
3791
3792                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3793                         if (r < 0)
3794                                 return r;
3795                 }
3796         }
3797
3798         *ret_symlinks = TAKE_PTR(symlinks);
3799
3800         return 0;
3801 }
3802
3803 static bool insist_on_sandboxing(
3804                 const ExecContext *context,
3805                 const char *root_dir,
3806                 const char *root_image,
3807                 const BindMount *bind_mounts,
3808                 size_t n_bind_mounts) {
3809
3810         assert(context);
3811         assert(n_bind_mounts == 0 || bind_mounts);
3812
3813         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3814          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3815          * rearrange stuff in a way we cannot ignore gracefully. */
3816
3817         if (context->n_temporary_filesystems > 0)
3818                 return true;
3819
3820         if (root_dir || root_image)
3821                 return true;
3822
3823         if (context->n_mount_images > 0)
3824                 return true;
3825
3826         if (context->dynamic_user)
3827                 return true;
3828
3829         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3830                 return true;
3831
3832         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3833          * essential. */
3834         for (size_t i = 0; i < n_bind_mounts; i++)
3835                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3836                         return true;
3837
3838         if (context->log_namespace)
3839                 return true;
3840
3841         return false;
3842 }
3843
3844 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3845         _cleanup_close_ int fd = -EBADF;
3846         int r;
3847
3848         if (!runtime || !runtime->ephemeral_copy)
3849                 return 0;
3850
3851         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3852         if (r < 0)
3853                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3854
3855         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3856
3857         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3858         if (fd >= 0)
3859                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3860                 return 0;
3861
3862         if (fd != -EAGAIN)
3863                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3864
3865         log_debug("Making ephemeral snapshot of %s to %s",
3866                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3867
3868         if (context->root_image)
3869                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3870                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3871         else
3872                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3873                                               AT_FDCWD, runtime->ephemeral_copy,
3874                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3875                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3876                                               BTRFS_SNAPSHOT_RECURSIVE |
3877                                               BTRFS_SNAPSHOT_LOCK_BSD);
3878         if (fd < 0)
3879                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3880                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3881
3882         if (context->root_image) {
3883                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3884                  * which tends to not perform well in combination with lots of random writes.
3885                  *
3886                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3887                  * copy, but we at least want to make the intention clear.
3888                  */
3889                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3890                 if (r < 0)
3891                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3892         }
3893
3894         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3895         if (r < 0)
3896                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3897
3898         return 1;
3899 }
3900
3901 static int verity_settings_prepare(
3902                 VeritySettings *verity,
3903                 const char *root_image,
3904                 const void *root_hash,
3905                 size_t root_hash_size,
3906                 const char *root_hash_path,
3907                 const void *root_hash_sig,
3908                 size_t root_hash_sig_size,
3909                 const char *root_hash_sig_path,
3910                 const char *verity_data_path) {
3911
3912         int r;
3913
3914         assert(verity);
3915
3916         if (root_hash) {
3917                 void *d;
3918
3919                 d = memdup(root_hash, root_hash_size);
3920                 if (!d)
3921                         return -ENOMEM;
3922
3923                 free_and_replace(verity->root_hash, d);
3924                 verity->root_hash_size = root_hash_size;
3925                 verity->designator = PARTITION_ROOT;
3926         }
3927
3928         if (root_hash_sig) {
3929                 void *d;
3930
3931                 d = memdup(root_hash_sig, root_hash_sig_size);
3932                 if (!d)
3933                         return -ENOMEM;
3934
3935                 free_and_replace(verity->root_hash_sig, d);
3936                 verity->root_hash_sig_size = root_hash_sig_size;
3937                 verity->designator = PARTITION_ROOT;
3938         }
3939
3940         if (verity_data_path) {
3941                 r = free_and_strdup(&verity->data_path, verity_data_path);
3942                 if (r < 0)
3943                         return r;
3944         }
3945
3946         r = verity_settings_load(
3947                         verity,
3948                         root_image,
3949                         root_hash_path,
3950                         root_hash_sig_path);
3951         if (r < 0)
3952                 return log_debug_errno(r, "Failed to load root hash: %m");
3953
3954         return 0;
3955 }
3956
3957 static int apply_mount_namespace(
3958                 const Unit *u,
3959                 ExecCommandFlags command_flags,
3960                 const ExecContext *context,
3961                 const ExecParameters *params,
3962                 ExecRuntime *runtime,
3963                 const char *memory_pressure_path,
3964                 char **error_path) {
3965
3966         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3967         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3968                         **read_write_paths_cleanup = NULL;
3969         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3970                         *extension_dir = NULL, *host_os_release = NULL;
3971         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3972         char **read_write_paths;
3973         NamespaceInfo ns_info;
3974         bool needs_sandboxing;
3975         BindMount *bind_mounts = NULL;
3976         size_t n_bind_mounts = 0;
3977         int r;
3978
3979         assert(context);
3980
3981         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3982
3983         if (params->flags & EXEC_APPLY_CHROOT) {
3984                 r = setup_ephemeral(context, runtime);
3985                 if (r < 0)
3986                         return r;
3987
3988                 if (context->root_image)
3989                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3990                 else
3991                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3992         }
3993
3994         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3995         if (r < 0)
3996                 return r;
3997
3998         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3999         r = compile_symlinks(context, params, &symlinks);
4000         if (r < 0)
4001                 return r;
4002
4003         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4004          * service will need to write to it in order to start the notifications. */
4005         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4006                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4007                 if (!read_write_paths_cleanup)
4008                         return -ENOMEM;
4009
4010                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4011                 if (r < 0)
4012                         return r;
4013
4014                 read_write_paths = read_write_paths_cleanup;
4015         } else
4016                 read_write_paths = context->read_write_paths;
4017
4018         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4019         if (needs_sandboxing) {
4020                 /* The runtime struct only contains the parent of the private /tmp,
4021                  * which is non-accessible to world users. Inside of it there's a /tmp
4022                  * that is sticky, and that's the one we want to use here.
4023                  * This does not apply when we are using /run/systemd/empty as fallback. */
4024
4025                 if (context->private_tmp && runtime && runtime->shared) {
4026                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4027                                 tmp_dir = runtime->shared->tmp_dir;
4028                         else if (runtime->shared->tmp_dir)
4029                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4030
4031                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4032                                 var_tmp_dir = runtime->shared->var_tmp_dir;
4033                         else if (runtime->shared->var_tmp_dir)
4034                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4035                 }
4036
4037                 ns_info = (NamespaceInfo) {
4038                         .ignore_protect_paths = false,
4039                         .private_dev = context->private_devices,
4040                         .protect_control_groups = context->protect_control_groups,
4041                         .protect_kernel_tunables = context->protect_kernel_tunables,
4042                         .protect_kernel_modules = context->protect_kernel_modules,
4043                         .protect_kernel_logs = context->protect_kernel_logs,
4044                         .protect_hostname = context->protect_hostname,
4045                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4046                         .protect_home = context->protect_home,
4047                         .protect_system = context->protect_system,
4048                         .protect_proc = context->protect_proc,
4049                         .proc_subset = context->proc_subset,
4050                         .private_network = exec_needs_network_namespace(context),
4051                         .private_ipc = exec_needs_ipc_namespace(context),
4052                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4053                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4054                 };
4055         } else if (!context->dynamic_user && root_dir)
4056                 /*
4057                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4058                  * sandbox info, otherwise enforce it, don't ignore protected paths and
4059                  * fail if we are enable to apply the sandbox inside the mount namespace.
4060                  */
4061                 ns_info = (NamespaceInfo) {
4062                         .ignore_protect_paths = true,
4063                 };
4064         else
4065                 ns_info = (NamespaceInfo) {};
4066
4067         if (context->mount_propagation_flag == MS_SHARED)
4068                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4069
4070         if (exec_context_has_credentials(context) &&
4071             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4072             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4073                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4074                 if (!creds_path)
4075                         return -ENOMEM;
4076         }
4077
4078         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4079                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4080                 if (!propagate_dir)
4081                         return -ENOMEM;
4082
4083                 incoming_dir = strdup("/run/systemd/incoming");
4084                 if (!incoming_dir)
4085                         return -ENOMEM;
4086
4087                 extension_dir = strdup("/run/systemd/unit-extensions");
4088                 if (!extension_dir)
4089                         return -ENOMEM;
4090
4091                 /* If running under a different root filesystem, propagate the host's os-release. We make a
4092                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
4093                 if (root_dir || root_image) {
4094                         host_os_release = strdup("/run/systemd/propagate/os-release");
4095                         if (!host_os_release)
4096                                 return -ENOMEM;
4097                 }
4098         } else {
4099                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4100
4101                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4102                         return -ENOMEM;
4103
4104                 if (root_dir || root_image) {
4105                         if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
4106                                 return -ENOMEM;
4107                 }
4108         }
4109
4110         if (root_image) {
4111                 r = verity_settings_prepare(
4112                         &verity,
4113                         root_image,
4114                         context->root_hash, context->root_hash_size, context->root_hash_path,
4115                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4116                         context->root_verity);
4117                 if (r < 0)
4118                         return r;
4119         }
4120
4121         r = setup_namespace(
4122                         root_dir,
4123                         root_image,
4124                         context->root_image_options,
4125                         context->root_image_policy ?: &image_policy_service,
4126                         &ns_info,
4127                         read_write_paths,
4128                         needs_sandboxing ? context->read_only_paths : NULL,
4129                         needs_sandboxing ? context->inaccessible_paths : NULL,
4130                         needs_sandboxing ? context->exec_paths : NULL,
4131                         needs_sandboxing ? context->no_exec_paths : NULL,
4132                         empty_directories,
4133                         symlinks,
4134                         bind_mounts,
4135                         n_bind_mounts,
4136                         context->temporary_filesystems,
4137                         context->n_temporary_filesystems,
4138                         context->mount_images,
4139                         context->n_mount_images,
4140                         context->mount_image_policy ?: &image_policy_service,
4141                         tmp_dir,
4142                         var_tmp_dir,
4143                         creds_path,
4144                         context->log_namespace,
4145                         context->mount_propagation_flag,
4146                         &verity,
4147                         context->extension_images,
4148                         context->n_extension_images,
4149                         context->extension_image_policy ?: &image_policy_sysext,
4150                         context->extension_directories,
4151                         propagate_dir,
4152                         incoming_dir,
4153                         extension_dir,
4154                         root_dir || root_image ? params->notify_socket : NULL,
4155                         host_os_release,
4156                         error_path);
4157
4158         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4159          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4160          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4161          * completely different execution environment. */
4162         if (r == -ENOANO) {
4163                 if (insist_on_sandboxing(
4164                                     context,
4165                                     root_dir, root_image,
4166                                     bind_mounts,
4167                                     n_bind_mounts))
4168                         return log_unit_debug_errno(u,
4169                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
4170                                                     "Failed to set up namespace, and refusing to continue since "
4171                                                     "the selected namespacing options alter mount environment non-trivially.\n"
4172                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4173                                                     n_bind_mounts,
4174                                                     context->n_temporary_filesystems,
4175                                                     yes_no(root_dir),
4176                                                     yes_no(root_image),
4177                                                     yes_no(context->dynamic_user));
4178
4179                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4180                 return 0;
4181         }
4182
4183         return r;
4184 }
4185
4186 static int apply_working_directory(
4187                 const ExecContext *context,
4188                 const ExecParameters *params,
4189                 ExecRuntime *runtime,
4190                 const char *home,
4191                 int *exit_status) {
4192
4193         const char *d, *wd;
4194
4195         assert(context);
4196         assert(exit_status);
4197
4198         if (context->working_directory_home) {
4199
4200                 if (!home) {
4201                         *exit_status = EXIT_CHDIR;
4202                         return -ENXIO;
4203                 }
4204
4205                 wd = home;
4206
4207         } else
4208                 wd = empty_to_root(context->working_directory);
4209
4210         if (params->flags & EXEC_APPLY_CHROOT)
4211                 d = wd;
4212         else
4213                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4214
4215         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4216                 *exit_status = EXIT_CHDIR;
4217                 return -errno;
4218         }
4219
4220         return 0;
4221 }
4222
4223 static int apply_root_directory(
4224                 const ExecContext *context,
4225                 const ExecParameters *params,
4226                 ExecRuntime *runtime,
4227                 const bool needs_mount_ns,
4228                 int *exit_status) {
4229
4230         assert(context);
4231         assert(exit_status);
4232
4233         if (params->flags & EXEC_APPLY_CHROOT)
4234                 if (!needs_mount_ns && context->root_directory)
4235                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4236                                 *exit_status = EXIT_CHROOT;
4237                                 return -errno;
4238                         }
4239
4240         return 0;
4241 }
4242
4243 static int setup_keyring(
4244                 const Unit *u,
4245                 const ExecContext *context,
4246                 const ExecParameters *p,
4247                 uid_t uid, gid_t gid) {
4248
4249         key_serial_t keyring;
4250         int r = 0;
4251         uid_t saved_uid;
4252         gid_t saved_gid;
4253
4254         assert(u);
4255         assert(context);
4256         assert(p);
4257
4258         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4259          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4260          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4261          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4262          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4263          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4264
4265         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4266                 return 0;
4267
4268         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4269          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4270          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4271          * & group is just as nasty as acquiring a reference to the user keyring. */
4272
4273         saved_uid = getuid();
4274         saved_gid = getgid();
4275
4276         if (gid_is_valid(gid) && gid != saved_gid) {
4277                 if (setregid(gid, -1) < 0)
4278                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4279         }
4280
4281         if (uid_is_valid(uid) && uid != saved_uid) {
4282                 if (setreuid(uid, -1) < 0) {
4283                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4284                         goto out;
4285                 }
4286         }
4287
4288         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4289         if (keyring == -1) {
4290                 if (errno == ENOSYS)
4291                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4292                 else if (ERRNO_IS_PRIVILEGE(errno))
4293                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4294                 else if (errno == EDQUOT)
4295                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4296                 else
4297                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4298
4299                 goto out;
4300         }
4301
4302         /* When requested link the user keyring into the session keyring. */
4303         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4304
4305                 if (keyctl(KEYCTL_LINK,
4306                            KEY_SPEC_USER_KEYRING,
4307                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4308                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4309                         goto out;
4310                 }
4311         }
4312
4313         /* Restore uid/gid back */
4314         if (uid_is_valid(uid) && uid != saved_uid) {
4315                 if (setreuid(saved_uid, -1) < 0) {
4316                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4317                         goto out;
4318                 }
4319         }
4320
4321         if (gid_is_valid(gid) && gid != saved_gid) {
4322                 if (setregid(saved_gid, -1) < 0)
4323                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4324         }
4325
4326         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4327         if (!sd_id128_is_null(u->invocation_id)) {
4328                 key_serial_t key;
4329
4330                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4331                 if (key == -1)
4332                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4333                 else {
4334                         if (keyctl(KEYCTL_SETPERM, key,
4335                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4336                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4337                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4338                 }
4339         }
4340
4341 out:
4342         /* Revert back uid & gid for the last time, and exit */
4343         /* no extra logging, as only the first already reported error matters */
4344         if (getuid() != saved_uid)
4345                 (void) setreuid(saved_uid, -1);
4346
4347         if (getgid() != saved_gid)
4348                 (void) setregid(saved_gid, -1);
4349
4350         return r;
4351 }
4352
/* Appends those of the two file descriptors in 'pair' that are valid (i.e. >= 0) to 'array', in order,
 * and advances the element counter *n for each one stored. The caller must make sure the array has room
 * for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                ++*n;
        }
}
4363
4364 static int close_remaining_fds(
4365                 const ExecParameters *params,
4366                 const ExecRuntime *runtime,
4367                 int user_lookup_fd,
4368                 int socket_fd,
4369                 const int *fds, size_t n_fds) {
4370
4371         size_t n_dont_close = 0;
4372         int dont_close[n_fds + 14];
4373
4374         assert(params);
4375
4376         if (params->stdin_fd >= 0)
4377                 dont_close[n_dont_close++] = params->stdin_fd;
4378         if (params->stdout_fd >= 0)
4379                 dont_close[n_dont_close++] = params->stdout_fd;
4380         if (params->stderr_fd >= 0)
4381                 dont_close[n_dont_close++] = params->stderr_fd;
4382
4383         if (socket_fd >= 0)
4384                 dont_close[n_dont_close++] = socket_fd;
4385         if (n_fds > 0) {
4386                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4387                 n_dont_close += n_fds;
4388         }
4389
4390         if (runtime)
4391                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4392
4393         if (runtime && runtime->shared) {
4394                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4395                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4396         }
4397
4398         if (runtime && runtime->dynamic_creds) {
4399                 if (runtime->dynamic_creds->user)
4400                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4401                 if (runtime->dynamic_creds->group)
4402                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4403         }
4404
4405         if (user_lookup_fd >= 0)
4406                 dont_close[n_dont_close++] = user_lookup_fd;
4407
4408         return close_all_fds(dont_close, n_dont_close);
4409 }
4410
4411 static int send_user_lookup(
4412                 Unit *unit,
4413                 int user_lookup_fd,
4414                 uid_t uid,
4415                 gid_t gid) {
4416
4417         assert(unit);
4418
4419         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4420          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4421          * specified. */
4422
4423         if (user_lookup_fd < 0)
4424                 return 0;
4425
4426         if (!uid_is_valid(uid) && !gid_is_valid(gid))
4427                 return 0;
4428
4429         if (writev(user_lookup_fd,
4430                (struct iovec[]) {
4431                            IOVEC_MAKE(&uid, sizeof(uid)),
4432                            IOVEC_MAKE(&gid, sizeof(gid)),
4433                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4434                 return -errno;
4435
4436         return 0;
4437 }
4438
4439 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4440         int r;
4441
4442         assert(c);
4443         assert(home);
4444         assert(buf);
4445
4446         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4447
4448         if (*home)
4449                 return 0;
4450
4451         if (!c->working_directory_home)
4452                 return 0;
4453
4454         r = get_home_dir(buf);
4455         if (r < 0)
4456                 return r;
4457
4458         *home = *buf;
4459         return 1;
4460 }
4461
4462 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4463         _cleanup_strv_free_ char ** list = NULL;
4464         int r;
4465
4466         assert(c);
4467         assert(p);
4468         assert(ret);
4469
4470         assert(c->dynamic_user);
4471
4472         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4473          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4474          * directories. */
4475
4476         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4477                 if (t == EXEC_DIRECTORY_CONFIGURATION)
4478                         continue;
4479
4480                 if (!p->prefix[t])
4481                         continue;
4482
4483                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4484                         char *e;
4485
4486                         if (exec_directory_is_private(c, t))
4487                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4488                         else
4489                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4490                         if (!e)
4491                                 return -ENOMEM;
4492
4493                         r = strv_consume(&list, e);
4494                         if (r < 0)
4495                                 return r;
4496                 }
4497         }
4498
4499         *ret = TAKE_PTR(list);
4500
4501         return 0;
4502 }
4503
4504 static int exec_parameters_get_cgroup_path(
4505                 const ExecParameters *params,
4506                 const CGroupContext *c,
4507                 char **ret) {
4508
4509         const char *subgroup = NULL;
4510         char *p;
4511
4512         assert(params);
4513         assert(ret);
4514
4515         if (!params->cgroup_path)
4516                 return -EINVAL;
4517
4518         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4519          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4520          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4521          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4522          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4523          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4524          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4525          * flag, which is only passed for the former statements, not for the latter. */
4526
4527         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4528                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4529                         subgroup = ".control";
4530                 else
4531                         subgroup = c->delegate_subgroup;
4532         }
4533
4534         if (subgroup)
4535                 p = path_join(params->cgroup_path, subgroup);
4536         else
4537                 p = strdup(params->cgroup_path);
4538         if (!p)
4539                 return -ENOMEM;
4540
4541         *ret = p;
4542         return !!subgroup;
4543 }
4544
4545 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4546         _cleanup_(cpu_set_reset) CPUSet s = {};
4547         int r;
4548
4549         assert(c);
4550         assert(ret);
4551
4552         if (!c->numa_policy.nodes.set) {
4553                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4554                 return 0;
4555         }
4556
4557         r = numa_to_cpu_set(&c->numa_policy, &s);
4558         if (r < 0)
4559                 return r;
4560
4561         cpu_set_reset(ret);
4562
4563         return cpu_set_add_all(ret, &s);
4564 }
4565
4566 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4567         assert(c);
4568
4569         return c->cpu_affinity_from_numa;
4570 }
4571
4572 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4573         int r;
4574
4575         assert(fds);
4576         assert(n_fds);
4577         assert(*n_fds < fds_size);
4578         assert(ret_fd);
4579
4580         if (fd < 0) {
4581                 *ret_fd = -EBADF;
4582                 return 0;
4583         }
4584
4585         if (fd < 3 + (int) *n_fds) {
4586                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4587                  * the fds we pass to the process (or which are closed only during execve). */
4588
4589                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4590                 if (r < 0)
4591                         return -errno;
4592
4593                 close_and_replace(fd, r);
4594         }
4595
4596         *ret_fd = fds[*n_fds] = fd;
4597         (*n_fds) ++;
4598         return 1;
4599 }
4600
4601 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4602         union sockaddr_union addr = {
4603                 .un.sun_family = AF_UNIX,
4604         };
4605         socklen_t sa_len;
4606         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4607         int r;
4608
4609         assert(u);
4610         assert(of);
4611         assert(ofd >= 0);
4612
4613         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4614         if (r < 0)
4615                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4616
4617         sa_len = r;
4618
4619         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4620                 _cleanup_close_ int fd = -EBADF;
4621
4622                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4623                 if (fd < 0)
4624                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4625
4626                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4627                 if (r == -EPROTOTYPE)
4628                         continue;
4629                 if (r < 0)
4630                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4631
4632                 return TAKE_FD(fd);
4633         }
4634
4635         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4636 }
4637
4638 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4639         struct stat st;
4640         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4641
4642         assert(u);
4643         assert(of);
4644
4645         ofd = open(of->path, O_PATH | O_CLOEXEC);
4646         if (ofd < 0)
4647                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4648
4649         if (fstat(ofd, &st) < 0)
4650                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4651
4652         if (S_ISSOCK(st.st_mode)) {
4653                 fd = connect_unix_harder(u, of, ofd);
4654                 if (fd < 0)
4655                         return fd;
4656
4657                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4658                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4659                                                     of->path);
4660
4661                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4662         } else {
4663                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4664                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4665                         flags |= O_APPEND;
4666                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4667                         flags |= O_TRUNC;
4668
4669                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4670                 if (fd < 0)
4671                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4672
4673                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4674         }
4675
4676         return TAKE_FD(fd);
4677 }
4678
4679 static int collect_open_file_fds(
4680                 Unit *u,
4681                 OpenFile* open_files,
4682                 int **fds,
4683                 char ***fdnames,
4684                 size_t *n_fds) {
4685         int r;
4686
4687         assert(u);
4688         assert(fds);
4689         assert(fdnames);
4690         assert(n_fds);
4691
4692         LIST_FOREACH(open_files, of, open_files) {
4693                 _cleanup_close_ int fd = -EBADF;
4694
4695                 fd = get_open_file_fd(u, of);
4696                 if (fd < 0) {
4697                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4698                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4699                                 continue;
4700                         }
4701
4702                         return fd;
4703                 }
4704
4705                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4706                         return -ENOMEM;
4707
4708                 r = strv_extend(fdnames, of->fdname);
4709                 if (r < 0)
4710                         return r;
4711
4712                 (*fds)[*n_fds] = TAKE_FD(fd);
4713
4714                 (*n_fds)++;
4715         }
4716
4717         return 0;
4718 }
4719
4720 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4721         assert(unit);
4722         assert(msg);
4723         assert(executable);
4724
4725         if (!DEBUG_LOGGING)
4726                 return;
4727
4728         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4729
4730         log_unit_struct(unit, LOG_DEBUG,
4731                         "EXECUTABLE=%s", executable,
4732                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4733                         LOG_UNIT_INVOCATION_ID(unit));
4734 }
4735
4736 static bool exec_context_need_unprivileged_private_users(
4737                 const ExecContext *context,
4738                 const ExecParameters *params) {
4739
4740         assert(context);
4741         assert(params);
4742
4743         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4744          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4745          * (system manager) then we have privileges and don't need this. */
4746         if (params->runtime_scope != RUNTIME_SCOPE_USER)
4747                 return false;
4748
4749         return context->private_users ||
4750                context->private_tmp ||
4751                context->private_devices ||
4752                context->private_network ||
4753                context->network_namespace_path ||
4754                context->private_ipc ||
4755                context->ipc_namespace_path ||
4756                context->private_mounts > 0 ||
4757                context->mount_apivfs ||
4758                context->n_bind_mounts > 0 ||
4759                context->n_temporary_filesystems > 0 ||
4760                context->root_directory ||
4761                !strv_isempty(context->extension_directories) ||
4762                context->protect_system != PROTECT_SYSTEM_NO ||
4763                context->protect_home != PROTECT_HOME_NO ||
4764                context->protect_kernel_tunables ||
4765                context->protect_kernel_modules ||
4766                context->protect_kernel_logs ||
4767                context->protect_control_groups ||
4768                context->protect_clock ||
4769                context->protect_hostname ||
4770                !strv_isempty(context->read_write_paths) ||
4771                !strv_isempty(context->read_only_paths) ||
4772                !strv_isempty(context->inaccessible_paths) ||
4773                !strv_isempty(context->exec_paths) ||
4774                !strv_isempty(context->no_exec_paths);
4775 }
4776
4777 static int exec_child(
4778                 Unit *unit,
4779                 const ExecCommand *command,
4780                 const ExecContext *context,
4781                 const ExecParameters *params,
4782                 ExecRuntime *runtime,
4783                 const CGroupContext *cgroup_context,
4784                 int socket_fd,
4785                 const int named_iofds[static 3],
4786                 int *params_fds,
4787                 size_t n_socket_fds,
4788                 size_t n_storage_fds,
4789                 char **files_env,
4790                 int user_lookup_fd,
4791                 int *exit_status) {
4792
4793         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4794         int r, ngids = 0, exec_fd;
4795         _cleanup_free_ gid_t *supplementary_gids = NULL;
4796         const char *username = NULL, *groupname = NULL;
4797         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4798         const char *home = NULL, *shell = NULL;
4799         char **final_argv = NULL;
4800         dev_t journal_stream_dev = 0;
4801         ino_t journal_stream_ino = 0;
4802         bool userns_set_up = false;
4803         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4804                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4805                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4806                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4807 #if HAVE_SELINUX
4808         _cleanup_free_ char *mac_selinux_context_net = NULL;
4809         bool use_selinux = false;
4810 #endif
4811 #if ENABLE_SMACK
4812         bool use_smack = false;
4813 #endif
4814 #if HAVE_APPARMOR
4815         bool use_apparmor = false;
4816 #endif
4817         uid_t saved_uid = getuid();
4818         gid_t saved_gid = getgid();
4819         uid_t uid = UID_INVALID;
4820         gid_t gid = GID_INVALID;
4821         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4822                n_keep_fds; /* total number of fds not to close */
4823         int secure_bits;
4824         _cleanup_free_ gid_t *gids_after_pam = NULL;
4825         int ngids_after_pam = 0;
4826         _cleanup_free_ int *fds = NULL;
4827         _cleanup_strv_free_ char **fdnames = NULL;
4828
4829         assert(unit);
4830         assert(command);
4831         assert(context);
4832         assert(params);
4833         assert(exit_status);
4834
4835         /* Explicitly test for CVE-2021-4034 inspired invocations */
4836         assert(command->path);
4837         assert(!strv_isempty(command->argv));
4838
4839         rename_process_from_path(command->path);
4840
4841         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4842          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4843          * both of which will be demoted to SIG_DFL. */
4844         (void) default_signals(SIGNALS_CRASH_HANDLER,
4845                                SIGNALS_IGNORE);
4846
4847         if (context->ignore_sigpipe)
4848                 (void) ignore_signals(SIGPIPE);
4849
4850         r = reset_signal_mask();
4851         if (r < 0) {
4852                 *exit_status = EXIT_SIGNAL_MASK;
4853                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4854         }
4855
4856         if (params->idle_pipe)
4857                 do_idle_pipe_dance(params->idle_pipe);
4858
4859         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4860          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4861          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4862          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4863
4864         log_forget_fds();
4865         log_set_open_when_needed(true);
4866         log_settle_target();
4867
4868         /* In case anything used libc syslog(), close this here, too */
4869         closelog();
4870
4871         fds = newdup(int, params_fds, n_fds);
4872         if (!fds) {
4873                 *exit_status = EXIT_MEMORY;
4874                 return log_oom();
4875         }
4876
4877         fdnames = strv_copy((char**) params->fd_names);
4878         if (!fdnames) {
4879                 *exit_status = EXIT_MEMORY;
4880                 return log_oom();
4881         }
4882
4883         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4884         if (r < 0) {
4885                 *exit_status = EXIT_FDS;
4886                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4887         }
4888
4889         int keep_fds[n_fds + 3];
4890         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4891         n_keep_fds = n_fds;
4892
4893         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4894         if (r < 0) {
4895                 *exit_status = EXIT_FDS;
4896                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4897         }
4898
4899 #if HAVE_LIBBPF
4900         if (unit->manager->restrict_fs) {
4901                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4902                 if (bpf_map_fd < 0) {
4903                         *exit_status = EXIT_FDS;
4904                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4905                 }
4906
4907                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4908                 if (r < 0) {
4909                         *exit_status = EXIT_FDS;
4910                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4911                 }
4912         }
4913 #endif
4914
4915         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4916         if (r < 0) {
4917                 *exit_status = EXIT_FDS;
4918                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4919         }
4920
4921         if (!context->same_pgrp &&
4922             setsid() < 0) {
4923                 *exit_status = EXIT_SETSID;
4924                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4925         }
4926
4927         exec_context_tty_reset(context, params);
4928
4929         if (unit_shall_confirm_spawn(unit)) {
4930                 _cleanup_free_ char *cmdline = NULL;
4931
4932                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4933                 if (!cmdline) {
4934                         *exit_status = EXIT_MEMORY;
4935                         return log_oom();
4936                 }
4937
4938                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4939                 if (r != CONFIRM_EXECUTE) {
4940                         if (r == CONFIRM_PRETEND_SUCCESS) {
4941                                 *exit_status = EXIT_SUCCESS;
4942                                 return 0;
4943                         }
4944                         *exit_status = EXIT_CONFIRM;
4945                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4946                                                     "Execution cancelled by the user");
4947                 }
4948         }
4949
4950         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4951          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4952          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4953          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4954          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4955         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4956             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4957                 *exit_status = EXIT_MEMORY;
4958                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4959         }
4960
4961         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4962                 _cleanup_strv_free_ char **suggested_paths = NULL;
4963
4964                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4965                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4966                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4967                         *exit_status = EXIT_USER;
4968                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4969                 }
4970
4971                 r = compile_suggested_paths(context, params, &suggested_paths);
4972                 if (r < 0) {
4973                         *exit_status = EXIT_MEMORY;
4974                         return log_oom();
4975                 }
4976
4977                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4978                 if (r < 0) {
4979                         *exit_status = EXIT_USER;
4980                         if (r == -EILSEQ)
4981                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4982                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4983                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4984                 }
4985
4986                 if (!uid_is_valid(uid)) {
4987                         *exit_status = EXIT_USER;
4988                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4989                 }
4990
4991                 if (!gid_is_valid(gid)) {
4992                         *exit_status = EXIT_USER;
4993                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4994                 }
4995
4996                 if (runtime->dynamic_creds->user)
4997                         username = runtime->dynamic_creds->user->name;
4998
4999         } else {
5000                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5001                 if (r < 0) {
5002                         *exit_status = EXIT_USER;
5003                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5004                 }
5005
5006                 r = get_fixed_group(context, &groupname, &gid);
5007                 if (r < 0) {
5008                         *exit_status = EXIT_GROUP;
5009                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5010                 }
5011         }
5012
5013         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5014         r = get_supplementary_groups(context, username, groupname, gid,
5015                                      &supplementary_gids, &ngids);
5016         if (r < 0) {
5017                 *exit_status = EXIT_GROUP;
5018                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5019         }
5020
5021         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5022         if (r < 0) {
5023                 *exit_status = EXIT_USER;
5024                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5025         }
5026
5027         user_lookup_fd = safe_close(user_lookup_fd);
5028
5029         r = acquire_home(context, uid, &home, &home_buffer);
5030         if (r < 0) {
5031                 *exit_status = EXIT_CHDIR;
5032                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5033         }
5034
5035         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5036         if (socket_fd >= 0)
5037                 (void) fd_nonblock(socket_fd, false);
5038
5039         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5040          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5041         if (params->cgroup_path) {
5042                 _cleanup_free_ char *p = NULL;
5043
5044                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5045                 if (r < 0) {
5046                         *exit_status = EXIT_CGROUP;
5047                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5048                 }
5049
5050                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5051                 if (r == -EUCLEAN) {
5052                         *exit_status = EXIT_CGROUP;
5053                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5054                                                     "because the cgroup or one of its parents or "
5055                                                     "siblings is in the threaded mode: %m", p);
5056                 }
5057                 if (r < 0) {
5058                         *exit_status = EXIT_CGROUP;
5059                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5060                 }
5061         }
5062
5063         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5064                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5065                 if (r < 0) {
5066                         *exit_status = EXIT_NETWORK;
5067                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5068                 }
5069         }
5070
5071         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5072                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5073                 if (r < 0) {
5074                         *exit_status = EXIT_NAMESPACE;
5075                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5076                 }
5077         }
5078
5079         r = setup_input(context, params, socket_fd, named_iofds);
5080         if (r < 0) {
5081                 *exit_status = EXIT_STDIN;
5082                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5083         }
5084
5085         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5086         if (r < 0) {
5087                 *exit_status = EXIT_STDOUT;
5088                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5089         }
5090
5091         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5092         if (r < 0) {
5093                 *exit_status = EXIT_STDERR;
5094                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5095         }
5096
5097         if (context->oom_score_adjust_set) {
5098                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5099                  * prohibit write access to this file, and we shouldn't trip up over that. */
5100                 r = set_oom_score_adjust(context->oom_score_adjust);
5101                 if (ERRNO_IS_PRIVILEGE(r))
5102                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5103                 else if (r < 0) {
5104                         *exit_status = EXIT_OOM_ADJUST;
5105                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5106                 }
5107         }
5108
5109         if (context->coredump_filter_set) {
5110                 r = set_coredump_filter(context->coredump_filter);
5111                 if (ERRNO_IS_PRIVILEGE(r))
5112                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5113                 else if (r < 0)
5114                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5115         }
5116
5117         if (context->nice_set) {
5118                 r = setpriority_closest(context->nice);
5119                 if (r < 0)
5120                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5121         }
5122
5123         if (context->cpu_sched_set) {
5124                 struct sched_param param = {
5125                         .sched_priority = context->cpu_sched_priority,
5126                 };
5127
5128                 r = sched_setscheduler(0,
5129                                        context->cpu_sched_policy |
5130                                        (context->cpu_sched_reset_on_fork ?
5131                                         SCHED_RESET_ON_FORK : 0),
5132                                        &param);
5133                 if (r < 0) {
5134                         *exit_status = EXIT_SETSCHEDULER;
5135                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5136                 }
5137         }
5138
5139         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5140                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5141                 const CPUSet *cpu_set;
5142
5143                 if (context->cpu_affinity_from_numa) {
5144                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5145                         if (r < 0) {
5146                                 *exit_status = EXIT_CPUAFFINITY;
5147                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5148                         }
5149
5150                         cpu_set = &converted_cpu_set;
5151                 } else
5152                         cpu_set = &context->cpu_set;
5153
5154                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5155                         *exit_status = EXIT_CPUAFFINITY;
5156                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5157                 }
5158         }
5159
5160         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5161                 r = apply_numa_policy(&context->numa_policy);
5162                 if (r < 0) {
5163                         if (ERRNO_IS_NOT_SUPPORTED(r))
5164                                 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5165                         else {
5166                                 *exit_status = EXIT_NUMA_POLICY;
5167                                 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5168                         }
5169                 }
5170         }
5171
5172         if (context->ioprio_set)
5173                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5174                         *exit_status = EXIT_IOPRIO;
5175                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5176                 }
5177
5178         if (context->timer_slack_nsec != NSEC_INFINITY)
5179                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5180                         *exit_status = EXIT_TIMERSLACK;
5181                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5182                 }
5183
5184         if (context->personality != PERSONALITY_INVALID) {
5185                 r = safe_personality(context->personality);
5186                 if (r < 0) {
5187                         *exit_status = EXIT_PERSONALITY;
5188                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5189                 }
5190         }
5191
5192         if (context->utmp_id) {
5193                 const char *line = context->tty_path ?
5194                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5195                         NULL;
5196                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5197                                       line,
5198                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
5199                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5200                                       USER_PROCESS,
5201                                       username);
5202         }
5203
5204         if (uid_is_valid(uid)) {
5205                 r = chown_terminal(STDIN_FILENO, uid);
5206                 if (r < 0) {
5207                         *exit_status = EXIT_STDIN;
5208                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5209                 }
5210         }
5211
5212         if (params->cgroup_path) {
5213                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5214                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5215                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5216                  * touch a single hierarchy too. */
5217
5218                 if (params->flags & EXEC_CGROUP_DELEGATE) {
5219                         _cleanup_free_ char *p = NULL;
5220
5221                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5222                         if (r < 0) {
5223                                 *exit_status = EXIT_CGROUP;
5224                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5225                         }
5226
5227                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5228                         if (r < 0) {
5229                                 *exit_status = EXIT_CGROUP;
5230                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5231                         }
5232                         if (r > 0) {
5233                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5234                                 if (r < 0) {
5235                                         *exit_status = EXIT_CGROUP;
5236                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5237                                 }
5238                         }
5239                 }
5240
5241                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5242                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
5243                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5244                                 if (r < 0) {
5245                                         *exit_status = EXIT_MEMORY;
5246                                         return log_oom();
5247                                 }
5248
5249                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5250                                 if (r < 0) {
5251                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5252                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5253                                         memory_pressure_path = mfree(memory_pressure_path);
5254                                 }
5255                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5256                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5257                                 if (!memory_pressure_path) {
5258                                         *exit_status = EXIT_MEMORY;
5259                                         return log_oom();
5260                                 }
5261                         }
5262                 }
5263         }
5264
5265         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5266
5267         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5268                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5269                 if (r < 0)
5270                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5271         }
5272
5273         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5274                 r = setup_credentials(context, params, unit->id, uid);
5275                 if (r < 0) {
5276                         *exit_status = EXIT_CREDENTIALS;
5277                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5278                 }
5279         }
5280
5281         r = build_environment(
5282                         unit,
5283                         context,
5284                         params,
5285                         cgroup_context,
5286                         n_fds,
5287                         fdnames,
5288                         home,
5289                         username,
5290                         shell,
5291                         journal_stream_dev,
5292                         journal_stream_ino,
5293                         memory_pressure_path,
5294                         &our_env);
5295         if (r < 0) {
5296                 *exit_status = EXIT_MEMORY;
5297                 return log_oom();
5298         }
5299
5300         r = build_pass_environment(context, &pass_env);
5301         if (r < 0) {
5302                 *exit_status = EXIT_MEMORY;
5303                 return log_oom();
5304         }
5305
5306         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5307          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5308          * not specify PATH but the unit has ExecSearchPath. */
5309         if (!strv_isempty(context->exec_search_path)) {
5310                 _cleanup_free_ char *joined = NULL;
5311
5312                 joined = strv_join(context->exec_search_path, ":");
5313                 if (!joined) {
5314                         *exit_status = EXIT_MEMORY;
5315                         return log_oom();
5316                 }
5317
5318                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5319                 if (r < 0) {
5320                         *exit_status = EXIT_MEMORY;
5321                         return log_oom();
5322                 }
5323         }
5324
5325         accum_env = strv_env_merge(params->environment,
5326                                    our_env,
5327                                    joined_exec_search_path,
5328                                    pass_env,
5329                                    context->environment,
5330                                    files_env);
5331         if (!accum_env) {
5332                 *exit_status = EXIT_MEMORY;
5333                 return log_oom();
5334         }
5335         accum_env = strv_env_clean(accum_env);
5336
5337         (void) umask(context->umask);
5338
5339         r = setup_keyring(unit, context, params, uid, gid);
5340         if (r < 0) {
5341                 *exit_status = EXIT_KEYRING;
5342                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5343         }
5344
5345         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5346          * from it. */
5347         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5348
5349         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5350          * for it, and the kernel doesn't actually support ambient caps. */
5351         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5352
5353         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5354          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5355          * desired. */
5356         if (needs_ambient_hack)
5357                 needs_setuid = false;
5358         else
5359                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5360
5361         uint64_t capability_ambient_set = context->capability_ambient_set;
5362
5363         if (needs_sandboxing) {
5364                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5365                  * /sys being present. The actual MAC context application will happen later, as late as
5366                  * possible, to avoid impacting our own code paths. */
5367
5368 #if HAVE_SELINUX
5369                 use_selinux = mac_selinux_use();
5370 #endif
5371 #if ENABLE_SMACK
5372                 use_smack = mac_smack_use();
5373 #endif
5374 #if HAVE_APPARMOR
5375                 use_apparmor = mac_apparmor_use();
5376 #endif
5377         }
5378
5379         if (needs_sandboxing) {
5380                 int which_failed;
5381
5382                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5383                  * is set here. (See below.) */
5384
5385                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5386                 if (r < 0) {
5387                         *exit_status = EXIT_LIMITS;
5388                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5389                 }
5390         }
5391
5392         if (needs_setuid && context->pam_name && username) {
5393                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5394                  * wins here. (See above.) */
5395
5396                 /* All fds passed in the fds array will be closed in the pam child process. */
5397                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5398                 if (r < 0) {
5399                         *exit_status = EXIT_PAM;
5400                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5401                 }
5402
5403                 if (ambient_capabilities_supported()) {
5404                         uint64_t ambient_after_pam;
5405
5406                         /* PAM modules might have set some ambient caps. Query them here and merge them into
5407                          * the caps we want to set in the end, so that we don't end up unsetting them. */
5408                         r = capability_get_ambient(&ambient_after_pam);
5409                         if (r < 0) {
5410                                 *exit_status = EXIT_CAPABILITIES;
5411                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5412                         }
5413
5414                         capability_ambient_set |= ambient_after_pam;
5415                 }
5416
5417                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5418                 if (ngids_after_pam < 0) {
5419                         *exit_status = EXIT_MEMORY;
5420                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5421                 }
5422         }
5423
5424         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5425                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5426                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5427                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5428
5429                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5430                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5431                  * the actual requested operations fail (or silently continue). */
5432                 if (r < 0 && context->private_users) {
5433                         *exit_status = EXIT_USER;
5434                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5435                 }
5436                 if (r < 0)
5437                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5438                 else
5439                         userns_set_up = true;
5440         }
5441
5442         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5443
5444                 /* Try to enable network namespacing if network namespacing is available and we have
5445                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5446                  * new network namespace. And if we don't have that, then we could only create a network
5447                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5448                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5449                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5450                         if (r < 0) {
5451                                 if (ERRNO_IS_PRIVILEGE(r))
5452                                         log_unit_notice_errno(unit, r,
5453                                                                "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5454                                 else {
5455                                         *exit_status = EXIT_NETWORK;
5456                                         return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5457                                 }
5458                         }
5459                 } else if (context->network_namespace_path) {
5460                         *exit_status = EXIT_NETWORK;
5461                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5462                                                     "NetworkNamespacePath= is not supported, refusing.");
5463                 } else
5464                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5465         }
5466
5467         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5468
5469                 if (ns_type_supported(NAMESPACE_IPC)) {
5470                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5471                         if (r == -EPERM)
5472                                 log_unit_warning_errno(unit, r,
5473                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5474                         else if (r < 0) {
5475                                 *exit_status = EXIT_NAMESPACE;
5476                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5477                         }
5478                 } else if (context->ipc_namespace_path) {
5479                         *exit_status = EXIT_NAMESPACE;
5480                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5481                                                     "IPCNamespacePath= is not supported, refusing.");
5482                 } else
5483                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5484         }
5485
5486         if (needs_mount_namespace) {
5487                 _cleanup_free_ char *error_path = NULL;
5488
5489                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5490                 if (r < 0) {
5491                         *exit_status = EXIT_NAMESPACE;
5492                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5493                                                     error_path ? ": " : "", strempty(error_path));
5494                 }
5495         }
5496
5497         if (needs_sandboxing) {
5498                 r = apply_protect_hostname(unit, context, exit_status);
5499                 if (r < 0)
5500                         return r;
5501         }
5502
5503         if (context->memory_ksm >= 0)
5504                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5505                         if (ERRNO_IS_NOT_SUPPORTED(errno))
5506                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5507                         else {
5508                                 *exit_status = EXIT_KSM;
5509                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5510                         }
5511                 }
5512
5513         /* Drop groups as early as possible.
5514          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5515          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5516         if (needs_setuid) {
5517                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5518                 int ngids_to_enforce = 0;
5519
5520                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5521                                                    ngids,
5522                                                    gids_after_pam,
5523                                                    ngids_after_pam,
5524                                                    &gids_to_enforce);
5525                 if (ngids_to_enforce < 0) {
5526                         *exit_status = EXIT_MEMORY;
5527                         return log_unit_error_errno(unit,
5528                                                     ngids_to_enforce,
5529                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
5530                 }
5531
5532                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5533                 if (r < 0) {
5534                         *exit_status = EXIT_GROUP;
5535                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5536                 }
5537         }
5538
5539         /* If the user namespace was not set up above, try to do it now.
5540          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5541          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5542          * case of mount namespaces being less privileged when the mount point list is copied from a
5543          * different user namespace). */
5544
5545         if (needs_sandboxing && context->private_users && !userns_set_up) {
5546                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5547                 if (r < 0) {
5548                         *exit_status = EXIT_USER;
5549                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5550                 }
5551         }
5552
5553         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5554          * shall execute. */
5555
5556         _cleanup_free_ char *executable = NULL;
5557         _cleanup_close_ int executable_fd = -EBADF;
5558         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5559         if (r < 0) {
5560                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5561                         log_unit_struct_errno(unit, LOG_INFO, r,
5562                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5563                                               LOG_UNIT_INVOCATION_ID(unit),
5564                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5565                                                                command->path),
5566                                               "EXECUTABLE=%s", command->path);
5567                         return 0;
5568                 }
5569
5570                 *exit_status = EXIT_EXEC;
5571
5572                 return log_unit_struct_errno(unit, LOG_INFO, r,
5573                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5574                                              LOG_UNIT_INVOCATION_ID(unit),
5575                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5576                                                               command->path),
5577                                              "EXECUTABLE=%s", command->path);
5578         }
5579
5580         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5581         if (r < 0) {
5582                 *exit_status = EXIT_FDS;
5583                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5584         }
5585
5586 #if HAVE_SELINUX
5587         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5588                 int fd = -EBADF;
5589
5590                 if (socket_fd >= 0)
5591                         fd = socket_fd;
5592                 else if (params->n_socket_fds == 1)
5593                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5594                          * use context from that fd to compute the label. */
5595                         fd = params->fds[0];
5596
5597                 if (fd >= 0) {
5598                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5599                         if (r < 0) {
5600                                 if (!context->selinux_context_ignore) {
5601                                         *exit_status = EXIT_SELINUX_CONTEXT;
5602                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5603                                 }
5604                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5605                         }
5606                 }
5607         }
5608 #endif
5609
5610         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5611          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5612          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5613          * execve(). */
5614
5615         r = close_all_fds(keep_fds, n_keep_fds);
5616         if (r >= 0)
5617                 r = shift_fds(fds, n_fds);
5618         if (r >= 0)
5619                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5620         if (r < 0) {
5621                 *exit_status = EXIT_FDS;
5622                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5623         }
5624
5625         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5626          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5627          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5628          * came this far. */
5629
5630         secure_bits = context->secure_bits;
5631
5632         if (needs_sandboxing) {
5633                 uint64_t bset;
5634
5635                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5636                  * (Note this is placed after the general resource limit initialization, see above, in order
5637                  * to take precedence.) */
5638                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5639                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5640                                 *exit_status = EXIT_LIMITS;
5641                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5642                         }
5643                 }
5644
5645 #if ENABLE_SMACK
5646                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5647                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5648                 if (use_smack) {
5649                         r = setup_smack(unit->manager, context, executable_fd);
5650                         if (r < 0 && !context->smack_process_label_ignore) {
5651                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5652                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5653                         }
5654                 }
5655 #endif
5656
5657                 bset = context->capability_bounding_set;
5658                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5659                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5660                  * instead of us doing that */
5661                 if (needs_ambient_hack)
5662                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5663                                 (UINT64_C(1) << CAP_SETUID) |
5664                                 (UINT64_C(1) << CAP_SETGID);
5665
5666                 if (!cap_test_all(bset)) {
5667                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5668                         if (r < 0) {
5669                                 *exit_status = EXIT_CAPABILITIES;
5670                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5671                         }
5672                 }
5673
5674                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5675                  * keep-caps set.
5676                  *
5677                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5678                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5679                  * the ambient capabilities can be raised as they are present in the permitted and
5680                  * inheritable set. However it is possible that someone wants to set ambient capabilities
5681                  * without changing the user, so we also set the ambient capabilities here.
5682                  *
5683                  * The requested ambient capabilities are raised in the inheritable set if the second
5684                  * argument is true. */
5685                 if (!needs_ambient_hack) {
5686                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5687                         if (r < 0) {
5688                                 *exit_status = EXIT_CAPABILITIES;
5689                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5690                         }
5691                 }
5692         }
5693
5694         /* chroot to root directory first, before we lose the ability to chroot */
5695         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5696         if (r < 0)
5697                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5698
5699         if (needs_setuid) {
5700                 if (uid_is_valid(uid)) {
5701                         r = enforce_user(context, uid, capability_ambient_set);
5702                         if (r < 0) {
5703                                 *exit_status = EXIT_USER;
5704                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5705                         }
5706
5707                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5708
5709                                 /* Raise the ambient capabilities after user change. */
5710                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5711                                 if (r < 0) {
5712                                         *exit_status = EXIT_CAPABILITIES;
5713                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5714                                 }
5715                         }
5716                 }
5717         }
5718
5719         /* Apply working directory here, because the working directory might be on NFS and only the user running
5720          * this service might have the correct privilege to change to the working directory */
5721         r = apply_working_directory(context, params, runtime, home, exit_status);
5722         if (r < 0)
5723                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5724
5725         if (needs_sandboxing) {
5726                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5727                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5728                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5729                  * are restricted. */
5730
5731 #if HAVE_SELINUX
5732                 if (use_selinux) {
5733                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5734
5735                         if (exec_context) {
5736                                 r = setexeccon(exec_context);
5737                                 if (r < 0) {
5738                                         if (!context->selinux_context_ignore) {
5739                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5740                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5741                                         }
5742                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5743                                 }
5744                         }
5745                 }
5746 #endif
5747
5748 #if HAVE_APPARMOR
5749                 if (use_apparmor && context->apparmor_profile) {
5750                         r = aa_change_onexec(context->apparmor_profile);
5751                         if (r < 0 && !context->apparmor_profile_ignore) {
5752                                 *exit_status = EXIT_APPARMOR_PROFILE;
5753                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5754                         }
5755                 }
5756 #endif
5757
5758                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5759                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5760                  * requires CAP_SETPCAP. */
5761                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5762                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5763                          * effective set here.
5764                          *
5765                          * The effective set is overwritten during execve() with the following values:
5766                          *
5767                          * - ambient set (for non-root processes)
5768                          *
5769                          * - (inheritable | bounding) set for root processes)
5770                          *
5771                          * Hence there is no security impact to raise it in the effective set before execve
5772                          */
5773                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5774                         if (r < 0) {
5775                                 *exit_status = EXIT_CAPABILITIES;
5776                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5777                         }
5778                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5779                                 *exit_status = EXIT_SECUREBITS;
5780                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5781                         }
5782                 }
5783
5784                 if (context_has_no_new_privileges(context))
5785                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5786                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5787                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5788                         }
5789
5790 #if HAVE_SECCOMP
5791                 r = apply_address_families(unit, context);
5792                 if (r < 0) {
5793                         *exit_status = EXIT_ADDRESS_FAMILIES;
5794                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5795                 }
5796
5797                 r = apply_memory_deny_write_execute(unit, context);
5798                 if (r < 0) {
5799                         *exit_status = EXIT_SECCOMP;
5800                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5801                 }
5802
5803                 r = apply_restrict_realtime(unit, context);
5804                 if (r < 0) {
5805                         *exit_status = EXIT_SECCOMP;
5806                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5807                 }
5808
5809                 r = apply_restrict_suid_sgid(unit, context);
5810                 if (r < 0) {
5811                         *exit_status = EXIT_SECCOMP;
5812                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5813                 }
5814
5815                 r = apply_restrict_namespaces(unit, context);
5816                 if (r < 0) {
5817                         *exit_status = EXIT_SECCOMP;
5818                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5819                 }
5820
5821                 r = apply_protect_sysctl(unit, context);
5822                 if (r < 0) {
5823                         *exit_status = EXIT_SECCOMP;
5824                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5825                 }
5826
5827                 r = apply_protect_kernel_modules(unit, context);
5828                 if (r < 0) {
5829                         *exit_status = EXIT_SECCOMP;
5830                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5831                 }
5832
5833                 r = apply_protect_kernel_logs(unit, context);
5834                 if (r < 0) {
5835                         *exit_status = EXIT_SECCOMP;
5836                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5837                 }
5838
5839                 r = apply_protect_clock(unit, context);
5840                 if (r < 0) {
5841                         *exit_status = EXIT_SECCOMP;
5842                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5843                 }
5844
5845                 r = apply_private_devices(unit, context);
5846                 if (r < 0) {
5847                         *exit_status = EXIT_SECCOMP;
5848                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5849                 }
5850
5851                 r = apply_syscall_archs(unit, context);
5852                 if (r < 0) {
5853                         *exit_status = EXIT_SECCOMP;
5854                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5855                 }
5856
5857                 r = apply_lock_personality(unit, context);
5858                 if (r < 0) {
5859                         *exit_status = EXIT_SECCOMP;
5860                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5861                 }
5862
5863                 r = apply_syscall_log(unit, context);
5864                 if (r < 0) {
5865                         *exit_status = EXIT_SECCOMP;
5866                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5867                 }
5868
5869                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5870                  * by the filter as little as possible. */
5871                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5872                 if (r < 0) {
5873                         *exit_status = EXIT_SECCOMP;
5874                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5875                 }
5876 #endif
5877
5878 #if HAVE_LIBBPF
5879                 r = apply_restrict_filesystems(unit, context);
5880                 if (r < 0) {
5881                         *exit_status = EXIT_BPF;
5882                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5883                 }
5884 #endif
5885
5886         }
5887
5888         if (!strv_isempty(context->unset_environment)) {
5889                 char **ee = NULL;
5890
5891                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5892                 if (!ee) {
5893                         *exit_status = EXIT_MEMORY;
5894                         return log_oom();
5895                 }
5896
5897                 strv_free_and_replace(accum_env, ee);
5898         }
5899
5900         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5901                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5902
5903                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5904                 if (r < 0) {
5905                         *exit_status = EXIT_MEMORY;
5906                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5907                 }
5908                 final_argv = replaced_argv;
5909
5910                 if (!strv_isempty(unset_variables)) {
5911                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5912                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5913                 }
5914
5915                 if (!strv_isempty(bad_variables)) {
5916                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5917                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5918                 }
5919         } else
5920                 final_argv = command->argv;
5921
5922         log_command_line(unit, "Executing", executable, final_argv);
5923
5924         if (exec_fd >= 0) {
5925                 uint8_t hot = 1;
5926
5927                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5928                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5929
5930                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5931                         *exit_status = EXIT_EXEC;
5932                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5933                 }
5934         }
5935
5936         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5937
5938         if (exec_fd >= 0) {
5939                 uint8_t hot = 0;
5940
5941                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5942                  * that POLLHUP on it no longer means execve() succeeded. */
5943
5944                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5945                         *exit_status = EXIT_EXEC;
5946                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5947                 }
5948         }
5949
5950         *exit_status = EXIT_EXEC;
5951         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5952 }
5953
5954 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5955 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5956
5957 int exec_spawn(Unit *unit,
5958                ExecCommand *command,
5959                const ExecContext *context,
5960                const ExecParameters *params,
5961                ExecRuntime *runtime,
5962                const CGroupContext *cgroup_context,
5963                pid_t *ret) {
5964
5965         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5966         _cleanup_free_ char *subcgroup_path = NULL;
5967         _cleanup_strv_free_ char **files_env = NULL;
5968         size_t n_storage_fds = 0, n_socket_fds = 0;
5969         pid_t pid;
5970
5971         assert(unit);
5972         assert(command);
5973         assert(context);
5974         assert(ret);
5975         assert(params);
5976         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5977
5978         LOG_CONTEXT_PUSH_UNIT(unit);
5979
5980         if (context->std_input == EXEC_INPUT_SOCKET ||
5981             context->std_output == EXEC_OUTPUT_SOCKET ||
5982             context->std_error == EXEC_OUTPUT_SOCKET) {
5983
5984                 if (params->n_socket_fds > 1)
5985                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5986
5987                 if (params->n_socket_fds == 0)
5988                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5989
5990                 socket_fd = params->fds[0];
5991         } else {
5992                 socket_fd = -EBADF;
5993                 fds = params->fds;
5994                 n_socket_fds = params->n_socket_fds;
5995                 n_storage_fds = params->n_storage_fds;
5996         }
5997
5998         r = exec_context_named_iofds(context, params, named_iofds);
5999         if (r < 0)
6000                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6001
6002         r = exec_context_load_environment(unit, context, &files_env);
6003         if (r < 0)
6004                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6005
6006         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6007            and, until the next SELinux policy changes, we save further reloads in future children. */
6008         mac_selinux_maybe_reload();
6009
6010         /* We won't know the real executable path until we create the mount namespace in the child, but we
6011            want to log from the parent, so we use the possibly inaccurate path here. */
6012         log_command_line(unit, "About to execute", command->path, command->argv);
6013
6014         if (params->cgroup_path) {
6015                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6016                 if (r < 0)
6017                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6018                 if (r > 0) {
6019                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6020                          * realized by the unit logic) */
6021
6022                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6023                         if (r < 0)
6024                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6025                 }
6026         }
6027
6028         pid = fork();
6029         if (pid < 0)
6030                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6031
6032         if (pid == 0) {
6033                 int exit_status = EXIT_SUCCESS;
6034
6035                 r = exec_child(unit,
6036                                command,
6037                                context,
6038                                params,
6039                                runtime,
6040                                cgroup_context,
6041                                socket_fd,
6042                                named_iofds,
6043                                fds,
6044                                n_socket_fds,
6045                                n_storage_fds,
6046                                files_env,
6047                                unit->manager->user_lookup_fds[1],
6048                                &exit_status);
6049
6050                 if (r < 0) {
6051                         const char *status =
6052                                 exit_status_to_string(exit_status,
6053                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6054
6055                         log_unit_struct_errno(unit, LOG_ERR, r,
6056                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6057                                               LOG_UNIT_INVOCATION_ID(unit),
6058                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6059                                                                status, command->path),
6060                                               "EXECUTABLE=%s", command->path);
6061                 }
6062
6063                 _exit(exit_status);
6064         }
6065
6066         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6067
6068         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6069          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6070          * process will be killed too). */
6071         if (subcgroup_path)
6072                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6073
6074         exec_status_start(&command->exec_status, pid);
6075
6076         *ret = pid;
6077         return 0;
6078 }
6079
6080 void exec_context_init(ExecContext *c) {
6081         assert(c);
6082
6083         c->umask = 0022;
6084         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6085         c->cpu_sched_policy = SCHED_OTHER;
6086         c->syslog_priority = LOG_DAEMON|LOG_INFO;
6087         c->syslog_level_prefix = true;
6088         c->ignore_sigpipe = true;
6089         c->timer_slack_nsec = NSEC_INFINITY;
6090         c->personality = PERSONALITY_INVALID;
6091         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6092                 c->directories[t].mode = 0755;
6093         c->timeout_clean_usec = USEC_INFINITY;
6094         c->capability_bounding_set = CAP_MASK_UNSET;
6095         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6096         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6097         c->log_level_max = -1;
6098 #if HAVE_SECCOMP
6099         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6100 #endif
6101         c->tty_rows = UINT_MAX;
6102         c->tty_cols = UINT_MAX;
6103         numa_policy_reset(&c->numa_policy);
6104         c->private_mounts = -1;
6105         c->memory_ksm = -1;
6106 }
6107
6108 void exec_context_done(ExecContext *c) {
6109         assert(c);
6110
6111         c->environment = strv_free(c->environment);
6112         c->environment_files = strv_free(c->environment_files);
6113         c->pass_environment = strv_free(c->pass_environment);
6114         c->unset_environment = strv_free(c->unset_environment);
6115
6116         rlimit_free_all(c->rlimit);
6117
6118         for (size_t l = 0; l < 3; l++) {
6119                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6120                 c->stdio_file[l] = mfree(c->stdio_file[l]);
6121         }
6122
6123         c->working_directory = mfree(c->working_directory);
6124         c->root_directory = mfree(c->root_directory);
6125         c->root_image = mfree(c->root_image);
6126         c->root_image_options = mount_options_free_all(c->root_image_options);
6127         c->root_hash = mfree(c->root_hash);
6128         c->root_hash_size = 0;
6129         c->root_hash_path = mfree(c->root_hash_path);
6130         c->root_hash_sig = mfree(c->root_hash_sig);
6131         c->root_hash_sig_size = 0;
6132         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6133         c->root_verity = mfree(c->root_verity);
6134         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6135         c->extension_directories = strv_free(c->extension_directories);
6136         c->tty_path = mfree(c->tty_path);
6137         c->syslog_identifier = mfree(c->syslog_identifier);
6138         c->user = mfree(c->user);
6139         c->group = mfree(c->group);
6140
6141         c->supplementary_groups = strv_free(c->supplementary_groups);
6142
6143         c->pam_name = mfree(c->pam_name);
6144
6145         c->read_only_paths = strv_free(c->read_only_paths);
6146         c->read_write_paths = strv_free(c->read_write_paths);
6147         c->inaccessible_paths = strv_free(c->inaccessible_paths);
6148         c->exec_paths = strv_free(c->exec_paths);
6149         c->no_exec_paths = strv_free(c->no_exec_paths);
6150         c->exec_search_path = strv_free(c->exec_search_path);
6151
6152         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6153         c->bind_mounts = NULL;
6154         c->n_bind_mounts = 0;
6155         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6156         c->temporary_filesystems = NULL;
6157         c->n_temporary_filesystems = 0;
6158         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6159
6160         cpu_set_reset(&c->cpu_set);
6161         numa_policy_reset(&c->numa_policy);
6162
6163         c->utmp_id = mfree(c->utmp_id);
6164         c->selinux_context = mfree(c->selinux_context);
6165         c->apparmor_profile = mfree(c->apparmor_profile);
6166         c->smack_process_label = mfree(c->smack_process_label);
6167
6168         c->restrict_filesystems = set_free(c->restrict_filesystems);
6169
6170         c->syscall_filter = hashmap_free(c->syscall_filter);
6171         c->syscall_archs = set_free(c->syscall_archs);
6172         c->address_families = set_free(c->address_families);
6173
6174         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6175                 exec_directory_done(&c->directories[t]);
6176
6177         c->log_level_max = -1;
6178
6179         exec_context_free_log_extra_fields(c);
6180         c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
6181         c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
6182
6183         c->log_ratelimit_interval_usec = 0;
6184         c->log_ratelimit_burst = 0;
6185
6186         c->stdin_data = mfree(c->stdin_data);
6187         c->stdin_data_size = 0;
6188
6189         c->network_namespace_path = mfree(c->network_namespace_path);
6190         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6191
6192         c->log_namespace = mfree(c->log_namespace);
6193
6194         c->load_credentials = hashmap_free(c->load_credentials);
6195         c->set_credentials = hashmap_free(c->set_credentials);
6196         c->import_credentials = set_free(c->import_credentials);
6197
6198         c->root_image_policy = image_policy_free(c->root_image_policy);
6199         c->mount_image_policy = image_policy_free(c->mount_image_policy);
6200         c->extension_image_policy = image_policy_free(c->extension_image_policy);
6201 }
6202
6203 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6204         assert(c);
6205
6206         if (!runtime_prefix)
6207                 return 0;
6208
6209         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6210                 _cleanup_free_ char *p = NULL;
6211
6212                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6213                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6214                 else
6215                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6216                 if (!p)
6217                         return -ENOMEM;
6218
6219                 /* We execute this synchronously, since we need to be sure this is gone when we start the
6220                  * service next. */
6221                 (void) rm_rf(p, REMOVE_ROOT);
6222
6223                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6224                         _cleanup_free_ char *symlink_abs = NULL;
6225
6226                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6227                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6228                         else
6229                                 symlink_abs = path_join(runtime_prefix, *symlink);
6230                         if (!symlink_abs)
6231                                 return -ENOMEM;
6232
6233                         (void) unlink(symlink_abs);
6234                 }
6235         }
6236
6237         return 0;
6238 }
6239
6240 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6241         _cleanup_free_ char *p = NULL;
6242
6243         assert(c);
6244
6245         if (!runtime_prefix || !unit)
6246                 return 0;
6247
6248         p = path_join(runtime_prefix, "credentials", unit);
6249         if (!p)
6250                 return -ENOMEM;
6251
6252         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6253          * unmount it, and afterwards remove the mount point */
6254         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6255         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6256
6257         return 0;
6258 }
6259
6260 int exec_context_destroy_mount_ns_dir(Unit *u) {
6261         _cleanup_free_ char *p = NULL;
6262
6263         if (!u || !MANAGER_IS_SYSTEM(u->manager))
6264                 return 0;
6265
6266         p = path_join("/run/systemd/propagate/", u->id);
6267         if (!p)
6268                 return -ENOMEM;
6269
6270         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6271         if (rmdir(p) < 0 && errno != ENOENT)
6272                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6273
6274         return 0;
6275 }
6276
6277 static void exec_command_done(ExecCommand *c) {
6278         assert(c);
6279
6280         c->path = mfree(c->path);
6281         c->argv = strv_free(c->argv);
6282 }
6283
6284 void exec_command_done_array(ExecCommand *c, size_t n) {
6285         for (size_t i = 0; i < n; i++)
6286                 exec_command_done(c+i);
6287 }
6288
6289 ExecCommand* exec_command_free_list(ExecCommand *c) {
6290         ExecCommand *i;
6291
6292         while ((i = c)) {
6293                 LIST_REMOVE(command, c, i);
6294                 exec_command_done(i);
6295                 free(i);
6296         }
6297
6298         return NULL;
6299 }
6300
6301 void exec_command_free_array(ExecCommand **c, size_t n) {
6302         for (size_t i = 0; i < n; i++)
6303                 c[i] = exec_command_free_list(c[i]);
6304 }
6305
6306 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6307         for (size_t i = 0; i < n; i++)
6308                 exec_status_reset(&c[i].exec_status);
6309 }
6310
6311 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6312         for (size_t i = 0; i < n; i++)
6313                 LIST_FOREACH(command, z, c[i])
6314                         exec_status_reset(&z->exec_status);
6315 }
6316
/* Context passed to the invalid_env() callback through its userdata pointer. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is issued for */
        const char *path; /* printed after the rejected assignment in the log message */
} InvalidEnvInfo;
6321
6322 static void invalid_env(const char *p, void *userdata) {
6323         InvalidEnvInfo *info = userdata;
6324
6325         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6326 }
6327
6328 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6329         assert(c);
6330
6331         switch (fd_index) {
6332
6333         case STDIN_FILENO:
6334                 if (c->std_input != EXEC_INPUT_NAMED_FD)
6335                         return NULL;
6336
6337                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6338
6339         case STDOUT_FILENO:
6340                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6341                         return NULL;
6342
6343                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6344
6345         case STDERR_FILENO:
6346                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6347                         return NULL;
6348
6349                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6350
6351         default:
6352                 return NULL;
6353         }
6354 }
6355
6356 static int exec_context_named_iofds(
6357                 const ExecContext *c,
6358                 const ExecParameters *p,
6359                 int named_iofds[static 3]) {
6360
6361         size_t targets;
6362         const char* stdio_fdname[3];
6363         size_t n_fds;
6364
6365         assert(c);
6366         assert(p);
6367         assert(named_iofds);
6368
6369         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6370                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6371                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
6372
6373         for (size_t i = 0; i < 3; i++)
6374                 stdio_fdname[i] = exec_context_fdname(c, i);
6375
6376         n_fds = p->n_storage_fds + p->n_socket_fds;
6377
6378         for (size_t i = 0; i < n_fds  && targets > 0; i++)
6379                 if (named_iofds[STDIN_FILENO] < 0 &&
6380                     c->std_input == EXEC_INPUT_NAMED_FD &&
6381                     stdio_fdname[STDIN_FILENO] &&
6382                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6383
6384                         named_iofds[STDIN_FILENO] = p->fds[i];
6385                         targets--;
6386
6387                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6388                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
6389                            stdio_fdname[STDOUT_FILENO] &&
6390                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6391
6392                         named_iofds[STDOUT_FILENO] = p->fds[i];
6393                         targets--;
6394
6395                 } else if (named_iofds[STDERR_FILENO] < 0 &&
6396                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
6397                            stdio_fdname[STDERR_FILENO] &&
6398                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6399
6400                         named_iofds[STDERR_FILENO] = p->fds[i];
6401                         targets--;
6402                 }
6403
6404         return targets == 0 ? 0 : -ENOENT;
6405 }
6406
6407 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6408         _cleanup_strv_free_ char **v = NULL;
6409         int r;
6410
6411         assert(c);
6412         assert(ret);
6413
6414         STRV_FOREACH(i, c->environment_files) {
6415                 _cleanup_globfree_ glob_t pglob = {};
6416                 bool ignore = false;
6417                 char *fn = *i;
6418
6419                 if (fn[0] == '-') {
6420                         ignore = true;
6421                         fn++;
6422                 }
6423
6424                 if (!path_is_absolute(fn)) {
6425                         if (ignore)
6426                                 continue;
6427                         return -EINVAL;
6428                 }
6429
6430                 /* Filename supports globbing, take all matching files */
6431                 r = safe_glob(fn, 0, &pglob);
6432                 if (r < 0) {
6433                         if (ignore)
6434                                 continue;
6435                         return r;
6436                 }
6437
6438                 /* When we don't match anything, -ENOENT should be returned */
6439                 assert(pglob.gl_pathc > 0);
6440
6441                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6442                         _cleanup_strv_free_ char **p = NULL;
6443
6444                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6445                         if (r < 0) {
6446                                 if (ignore)
6447                                         continue;
6448                                 return r;
6449                         }
6450
6451                         /* Log invalid environment variables with filename */
6452                         if (p) {
6453                                 InvalidEnvInfo info = {
6454                                         .unit = unit,
6455                                         .path = pglob.gl_pathv[n]
6456                                 };
6457
6458                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
6459                         }
6460
6461                         if (!v)
6462                                 v = TAKE_PTR(p);
6463                         else {
6464                                 char **m = strv_env_merge(v, p);
6465                                 if (!m)
6466                                         return -ENOMEM;
6467
6468                                 strv_free_and_replace(v, m);
6469                         }
6470                 }
6471         }
6472
6473         *ret = TAKE_PTR(v);
6474
6475         return 0;
6476 }
6477
6478 static bool tty_may_match_dev_console(const char *tty) {
6479         _cleanup_free_ char *resolved = NULL;
6480
6481         if (!tty)
6482                 return true;
6483
6484         tty = skip_dev_prefix(tty);
6485
6486         /* trivial identity? */
6487         if (streq(tty, "console"))
6488                 return true;
6489
6490         if (resolve_dev_console(&resolved) < 0)
6491                 return true; /* if we could not resolve, assume it may */
6492
6493         /* "tty0" means the active VC, so it may be the same sometimes */
6494         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6495 }
6496
6497 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6498         assert(ec);
6499
6500         return ec->tty_reset ||
6501                 ec->tty_vhangup ||
6502                 ec->tty_vt_disallocate ||
6503                 is_terminal_input(ec->std_input) ||
6504                 is_terminal_output(ec->std_output) ||
6505                 is_terminal_output(ec->std_error);
6506 }
6507
6508 bool exec_context_may_touch_console(const ExecContext *ec) {
6509
6510         return exec_context_may_touch_tty(ec) &&
6511                tty_may_match_dev_console(exec_context_tty_path(ec));
6512 }
6513
6514 static void strv_fprintf(FILE *f, char **l) {
6515         assert(f);
6516
6517         STRV_FOREACH(g, l)
6518                 fprintf(f, " %s", *g);
6519 }
6520
/* Emits one "<prefix><name>: item item ..." line for the string vector. Nothing is written if the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6532
6533 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6534         int r;
6535
6536         assert(c);
6537         assert(f);
6538
6539         prefix = strempty(prefix);
6540
6541         fprintf(f,
6542                 "%sUMask: %04o\n"
6543                 "%sWorkingDirectory: %s\n"
6544                 "%sRootDirectory: %s\n"
6545                 "%sRootEphemeral: %s\n"
6546                 "%sNonBlocking: %s\n"
6547                 "%sPrivateTmp: %s\n"
6548                 "%sPrivateDevices: %s\n"
6549                 "%sProtectKernelTunables: %s\n"
6550                 "%sProtectKernelModules: %s\n"
6551                 "%sProtectKernelLogs: %s\n"
6552                 "%sProtectClock: %s\n"
6553                 "%sProtectControlGroups: %s\n"
6554                 "%sPrivateNetwork: %s\n"
6555                 "%sPrivateUsers: %s\n"
6556                 "%sProtectHome: %s\n"
6557                 "%sProtectSystem: %s\n"
6558                 "%sMountAPIVFS: %s\n"
6559                 "%sIgnoreSIGPIPE: %s\n"
6560                 "%sMemoryDenyWriteExecute: %s\n"
6561                 "%sRestrictRealtime: %s\n"
6562                 "%sRestrictSUIDSGID: %s\n"
6563                 "%sKeyringMode: %s\n"
6564                 "%sProtectHostname: %s\n"
6565                 "%sProtectProc: %s\n"
6566                 "%sProcSubset: %s\n",
6567                 prefix, c->umask,
6568                 prefix, empty_to_root(c->working_directory),
6569                 prefix, empty_to_root(c->root_directory),
6570                 prefix, yes_no(c->root_ephemeral),
6571                 prefix, yes_no(c->non_blocking),
6572                 prefix, yes_no(c->private_tmp),
6573                 prefix, yes_no(c->private_devices),
6574                 prefix, yes_no(c->protect_kernel_tunables),
6575                 prefix, yes_no(c->protect_kernel_modules),
6576                 prefix, yes_no(c->protect_kernel_logs),
6577                 prefix, yes_no(c->protect_clock),
6578                 prefix, yes_no(c->protect_control_groups),
6579                 prefix, yes_no(c->private_network),
6580                 prefix, yes_no(c->private_users),
6581                 prefix, protect_home_to_string(c->protect_home),
6582                 prefix, protect_system_to_string(c->protect_system),
6583                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6584                 prefix, yes_no(c->ignore_sigpipe),
6585                 prefix, yes_no(c->memory_deny_write_execute),
6586                 prefix, yes_no(c->restrict_realtime),
6587                 prefix, yes_no(c->restrict_suid_sgid),
6588                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6589                 prefix, yes_no(c->protect_hostname),
6590                 prefix, protect_proc_to_string(c->protect_proc),
6591                 prefix, proc_subset_to_string(c->proc_subset));
6592
6593         if (c->root_image)
6594                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6595
6596         if (c->root_image_options) {
6597                 fprintf(f, "%sRootImageOptions:", prefix);
6598                 LIST_FOREACH(mount_options, o, c->root_image_options)
6599                         if (!isempty(o->options))
6600                                 fprintf(f, " %s:%s",
6601                                         partition_designator_to_string(o->partition_designator),
6602                                         o->options);
6603                 fprintf(f, "\n");
6604         }
6605
6606         if (c->root_hash) {
6607                 _cleanup_free_ char *encoded = NULL;
6608                 encoded = hexmem(c->root_hash, c->root_hash_size);
6609                 if (encoded)
6610                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6611         }
6612
6613         if (c->root_hash_path)
6614                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6615
6616         if (c->root_hash_sig) {
6617                 _cleanup_free_ char *encoded = NULL;
6618                 ssize_t len;
6619                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6620                 if (len)
6621                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6622         }
6623
6624         if (c->root_hash_sig_path)
6625                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6626
6627         if (c->root_verity)
6628                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6629
6630         STRV_FOREACH(e, c->environment)
6631                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6632
6633         STRV_FOREACH(e, c->environment_files)
6634                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6635
6636         STRV_FOREACH(e, c->pass_environment)
6637                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6638
6639         STRV_FOREACH(e, c->unset_environment)
6640                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6641
6642         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6643
6644         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6645                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6646
6647                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6648                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6649
6650                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6651                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6652                 }
6653         }
6654
6655         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6656
6657         if (c->nice_set)
6658                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6659
6660         if (c->oom_score_adjust_set)
6661                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6662
6663         if (c->coredump_filter_set)
6664                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6665
6666         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6667                 if (c->rlimit[i]) {
6668                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6669                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6670                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6671                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6672                 }
6673
6674         if (c->ioprio_set) {
6675                 _cleanup_free_ char *class_str = NULL;
6676
6677                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6678                 if (r >= 0)
6679                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6680
6681                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6682         }
6683
6684         if (c->cpu_sched_set) {
6685                 _cleanup_free_ char *policy_str = NULL;
6686
6687                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6688                 if (r >= 0)
6689                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6690
6691                 fprintf(f,
6692                         "%sCPUSchedulingPriority: %i\n"
6693                         "%sCPUSchedulingResetOnFork: %s\n",
6694                         prefix, c->cpu_sched_priority,
6695                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6696         }
6697
6698         if (c->cpu_set.set) {
6699                 _cleanup_free_ char *affinity = NULL;
6700
6701                 affinity = cpu_set_to_range_string(&c->cpu_set);
6702                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6703         }
6704
6705         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6706                 _cleanup_free_ char *nodes = NULL;
6707
6708                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6709                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6710                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6711         }
6712
6713         if (c->timer_slack_nsec != NSEC_INFINITY)
6714                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6715
6716         fprintf(f,
6717                 "%sStandardInput: %s\n"
6718                 "%sStandardOutput: %s\n"
6719                 "%sStandardError: %s\n",
6720                 prefix, exec_input_to_string(c->std_input),
6721                 prefix, exec_output_to_string(c->std_output),
6722                 prefix, exec_output_to_string(c->std_error));
6723
6724         if (c->std_input == EXEC_INPUT_NAMED_FD)
6725                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6726         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6727                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6728         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6729                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6730
6731         if (c->std_input == EXEC_INPUT_FILE)
6732                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6733         if (c->std_output == EXEC_OUTPUT_FILE)
6734                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6735         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6736                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6737         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6738                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6739         if (c->std_error == EXEC_OUTPUT_FILE)
6740                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6741         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6742                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6743         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6744                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6745
6746         if (c->tty_path)
6747                 fprintf(f,
6748                         "%sTTYPath: %s\n"
6749                         "%sTTYReset: %s\n"
6750                         "%sTTYVHangup: %s\n"
6751                         "%sTTYVTDisallocate: %s\n"
6752                         "%sTTYRows: %u\n"
6753                         "%sTTYColumns: %u\n",
6754                         prefix, c->tty_path,
6755                         prefix, yes_no(c->tty_reset),
6756                         prefix, yes_no(c->tty_vhangup),
6757                         prefix, yes_no(c->tty_vt_disallocate),
6758                         prefix, c->tty_rows,
6759                         prefix, c->tty_cols);
6760
6761         if (IN_SET(c->std_output,
6762                    EXEC_OUTPUT_KMSG,
6763                    EXEC_OUTPUT_JOURNAL,
6764                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6765                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6766             IN_SET(c->std_error,
6767                    EXEC_OUTPUT_KMSG,
6768                    EXEC_OUTPUT_JOURNAL,
6769                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6770                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6771
6772                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6773
6774                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6775                 if (r >= 0)
6776                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6777
6778                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6779                 if (r >= 0)
6780                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6781         }
6782
6783         if (c->log_level_max >= 0) {
6784                 _cleanup_free_ char *t = NULL;
6785
6786                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6787
6788                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6789         }
6790
6791         if (c->log_ratelimit_interval_usec > 0)
6792                 fprintf(f,
6793                         "%sLogRateLimitIntervalSec: %s\n",
6794                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6795
6796         if (c->log_ratelimit_burst > 0)
6797                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6798
6799         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6800                 fprintf(f, "%sLogFilterPatterns:", prefix);
6801
6802                 char *pattern;
6803                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6804                         fprintf(f, " %s", pattern);
6805                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6806                         fprintf(f, " ~%s", pattern);
6807                 fputc('\n', f);
6808         }
6809
6810         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6811                 fprintf(f, "%sLogExtraFields: ", prefix);
6812                 fwrite(c->log_extra_fields[j].iov_base,
6813                        1, c->log_extra_fields[j].iov_len,
6814                        f);
6815                 fputc('\n', f);
6816         }
6817
6818         if (c->log_namespace)
6819                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6820
6821         if (c->secure_bits) {
6822                 _cleanup_free_ char *str = NULL;
6823
6824                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6825                 if (r >= 0)
6826                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6827         }
6828
6829         if (c->capability_bounding_set != CAP_MASK_UNSET) {
6830                 _cleanup_free_ char *str = NULL;
6831
6832                 r = capability_set_to_string(c->capability_bounding_set, &str);
6833                 if (r >= 0)
6834                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6835         }
6836
6837         if (c->capability_ambient_set != 0) {
6838                 _cleanup_free_ char *str = NULL;
6839
6840                 r = capability_set_to_string(c->capability_ambient_set, &str);
6841                 if (r >= 0)
6842                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6843         }
6844
6845         if (c->user)
6846                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6847         if (c->group)
6848                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6849
6850         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6851
6852         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6853
6854         if (c->pam_name)
6855                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6856
6857         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6858         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6859         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6860         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6861         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6862         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6863
6864         for (size_t i = 0; i < c->n_bind_mounts; i++)
6865                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6866                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6867                         c->bind_mounts[i].ignore_enoent ? "-": "",
6868                         c->bind_mounts[i].source,
6869                         c->bind_mounts[i].destination,
6870                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6871
6872         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6873                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6874
6875                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6876                         t->path,
6877                         isempty(t->options) ? "" : ":",
6878                         strempty(t->options));
6879         }
6880
6881         if (c->utmp_id)
6882                 fprintf(f,
6883                         "%sUtmpIdentifier: %s\n",
6884                         prefix, c->utmp_id);
6885
6886         if (c->selinux_context)
6887                 fprintf(f,
6888                         "%sSELinuxContext: %s%s\n",
6889                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6890
6891         if (c->apparmor_profile)
6892                 fprintf(f,
6893                         "%sAppArmorProfile: %s%s\n",
6894                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6895
6896         if (c->smack_process_label)
6897                 fprintf(f,
6898                         "%sSmackProcessLabel: %s%s\n",
6899                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6900
6901         if (c->personality != PERSONALITY_INVALID)
6902                 fprintf(f,
6903                         "%sPersonality: %s\n",
6904                         prefix, strna(personality_to_string(c->personality)));
6905
6906         fprintf(f,
6907                 "%sLockPersonality: %s\n",
6908                 prefix, yes_no(c->lock_personality));
6909
6910         if (c->syscall_filter) {
6911                 fprintf(f,
6912                         "%sSystemCallFilter: ",
6913                         prefix);
6914
6915                 if (!c->syscall_allow_list)
6916                         fputc('~', f);
6917
6918 #if HAVE_SECCOMP
6919                 void *id, *val;
6920                 bool first = true;
6921                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6922                         _cleanup_free_ char *name = NULL;
6923                         const char *errno_name = NULL;
6924                         int num = PTR_TO_INT(val);
6925
6926                         if (first)
6927                                 first = false;
6928                         else
6929                                 fputc(' ', f);
6930
6931                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6932                         fputs(strna(name), f);
6933
6934                         if (num >= 0) {
6935                                 errno_name = seccomp_errno_or_action_to_string(num);
6936                                 if (errno_name)
6937                                         fprintf(f, ":%s", errno_name);
6938                                 else
6939                                         fprintf(f, ":%d", num);
6940                         }
6941                 }
6942 #endif
6943
6944                 fputc('\n', f);
6945         }
6946
6947         if (c->syscall_archs) {
6948                 fprintf(f,
6949                         "%sSystemCallArchitectures:",
6950                         prefix);
6951
6952 #if HAVE_SECCOMP
6953                 void *id;
6954                 SET_FOREACH(id, c->syscall_archs)
6955                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6956 #endif
6957                 fputc('\n', f);
6958         }
6959
6960         if (exec_context_restrict_namespaces_set(c)) {
6961                 _cleanup_free_ char *s = NULL;
6962
6963                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6964                 if (r >= 0)
6965                         fprintf(f, "%sRestrictNamespaces: %s\n",
6966                                 prefix, strna(s));
6967         }
6968
6969 #if HAVE_LIBBPF
6970         if (exec_context_restrict_filesystems_set(c)) {
6971                 char *fs;
6972                 SET_FOREACH(fs, c->restrict_filesystems)
6973                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6974         }
6975 #endif
6976
6977         if (c->network_namespace_path)
6978                 fprintf(f,
6979                         "%sNetworkNamespacePath: %s\n",
6980                         prefix, c->network_namespace_path);
6981
6982         if (c->syscall_errno > 0) {
6983                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6984
6985 #if HAVE_SECCOMP
6986                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6987                 if (errno_name)
6988                         fputs(errno_name, f);
6989                 else
6990                         fprintf(f, "%d", c->syscall_errno);
6991 #endif
6992                 fputc('\n', f);
6993         }
6994
6995         for (size_t i = 0; i < c->n_mount_images; i++) {
6996                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6997                         c->mount_images[i].ignore_enoent ? "-": "",
6998                         c->mount_images[i].source,
6999                         c->mount_images[i].destination);
7000                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7001                         fprintf(f, ":%s:%s",
7002                                 partition_designator_to_string(o->partition_designator),
7003                                 strempty(o->options));
7004                 fprintf(f, "\n");
7005         }
7006
7007         for (size_t i = 0; i < c->n_extension_images; i++) {
7008                 fprintf(f, "%sExtensionImages: %s%s", prefix,
7009                         c->extension_images[i].ignore_enoent ? "-": "",
7010                         c->extension_images[i].source);
7011                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7012                         fprintf(f, ":%s:%s",
7013                                 partition_designator_to_string(o->partition_designator),
7014                                 strempty(o->options));
7015                 fprintf(f, "\n");
7016         }
7017
7018         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7019 }
7020
7021 bool exec_context_maintains_privileges(const ExecContext *c) {
7022         assert(c);
7023
7024         /* Returns true if the process forked off would run under
7025          * an unchanged UID or as root. */
7026
7027         if (!c->user)
7028                 return true;
7029
7030         if (streq(c->user, "root") || streq(c->user, "0"))
7031                 return true;
7032
7033         return false;
7034 }
7035
7036 int exec_context_get_effective_ioprio(const ExecContext *c) {
7037         int p;
7038
7039         assert(c);
7040
7041         if (c->ioprio_set)
7042                 return c->ioprio;
7043
7044         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7045         if (p < 0)
7046                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7047
7048         return ioprio_normalize(p);
7049 }
7050
7051 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7052         assert(c);
7053
7054         /* Explicit setting wins */
7055         if (c->mount_apivfs_set)
7056                 return c->mount_apivfs;
7057
7058         /* Default to "yes" if root directory or image are specified */
7059         if (exec_context_with_rootfs(c))
7060                 return true;
7061
7062         return false;
7063 }
7064
7065 void exec_context_free_log_extra_fields(ExecContext *c) {
7066         assert(c);
7067
7068         for (size_t l = 0; l < c->n_log_extra_fields; l++)
7069                 free(c->log_extra_fields[l].iov_base);
7070         c->log_extra_fields = mfree(c->log_extra_fields);
7071         c->n_log_extra_fields = 0;
7072 }
7073
7074 void exec_context_revert_tty(ExecContext *c) {
7075         _cleanup_close_ int fd = -EBADF;
7076         const char *path;
7077         struct stat st;
7078         int r;
7079
7080         assert(c);
7081
7082         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7083         exec_context_tty_reset(c, NULL);
7084
7085         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7086          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7087          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7088         if (!exec_context_may_touch_tty(c))
7089                 return;
7090
7091         path = exec_context_tty_path(c);
7092         if (!path)
7093                 return;
7094
7095         fd = open(path, O_PATH|O_CLOEXEC);
7096         if (fd < 0)
7097                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7098                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7099                                              path);
7100
7101         if (fstat(fd, &st) < 0)
7102                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7103
7104         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7105          * if things are a character device, since a proper check either means we'd have to open the TTY and
7106          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7107          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7108          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7109         if (!S_ISCHR(st.st_mode))
7110                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7111
7112         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7113         if (r < 0)
7114                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7115 }
7116
7117 int exec_context_get_clean_directories(
7118                 ExecContext *c,
7119                 char **prefix,
7120                 ExecCleanMask mask,
7121                 char ***ret) {
7122
7123         _cleanup_strv_free_ char **l = NULL;
7124         int r;
7125
7126         assert(c);
7127         assert(prefix);
7128         assert(ret);
7129
7130         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7131                 if (!FLAGS_SET(mask, 1U << t))
7132                         continue;
7133
7134                 if (!prefix[t])
7135                         continue;
7136
7137                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7138                         char *j;
7139
7140                         j = path_join(prefix[t], c->directories[t].items[i].path);
7141                         if (!j)
7142                                 return -ENOMEM;
7143
7144                         r = strv_consume(&l, j);
7145                         if (r < 0)
7146                                 return r;
7147
7148                         /* Also remove private directories unconditionally. */
7149                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
7150                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7151                                 if (!j)
7152                                         return -ENOMEM;
7153
7154                                 r = strv_consume(&l, j);
7155                                 if (r < 0)
7156                                         return r;
7157                         }
7158
7159                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7160                                 j = path_join(prefix[t], *symlink);
7161                                 if (!j)
7162                                         return -ENOMEM;
7163
7164                                 r = strv_consume(&l, j);
7165                                 if (r < 0)
7166                                         return r;
7167                         }
7168                 }
7169         }
7170
7171         *ret = TAKE_PTR(l);
7172         return 0;
7173 }
7174
7175 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7176         ExecCleanMask mask = 0;
7177
7178         assert(c);
7179         assert(ret);
7180
7181         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7182                 if (c->directories[t].n_items > 0)
7183                         mask |= 1U << t;
7184
7185         *ret = mask;
7186         return 0;
7187 }
7188
7189 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7190         ExecLoadCredential *load_cred;
7191         ExecSetCredential *set_cred;
7192
7193         assert(c);
7194
7195         HASHMAP_FOREACH(load_cred, c->load_credentials)
7196                 if (load_cred->encrypted)
7197                         return true;
7198
7199         HASHMAP_FOREACH(set_cred, c->set_credentials)
7200                 if (set_cred->encrypted)
7201                         return true;
7202
7203         return false;
7204 }
7205
7206 void exec_status_start(ExecStatus *s, pid_t pid) {
7207         assert(s);
7208
7209         *s = (ExecStatus) {
7210                 .pid = pid,
7211         };
7212
7213         dual_timestamp_get(&s->start_timestamp);
7214 }
7215
7216 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7217         assert(s);
7218
7219         if (s->pid != pid)
7220                 *s = (ExecStatus) {
7221                         .pid = pid,
7222                 };
7223
7224         dual_timestamp_get(&s->exit_timestamp);
7225
7226         s->code = code;
7227         s->status = status;
7228
7229         if (context && context->utmp_id)
7230                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7231 }
7232
7233 void exec_status_reset(ExecStatus *s) {
7234         assert(s);
7235
7236         *s = (ExecStatus) {};
7237 }
7238
7239 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7240         assert(s);
7241         assert(f);
7242
7243         if (s->pid <= 0)
7244                 return;
7245
7246         prefix = strempty(prefix);
7247
7248         fprintf(f,
7249                 "%sPID: "PID_FMT"\n",
7250                 prefix, s->pid);
7251
7252         if (dual_timestamp_is_set(&s->start_timestamp))
7253                 fprintf(f,
7254                         "%sStart Timestamp: %s\n",
7255                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7256
7257         if (dual_timestamp_is_set(&s->exit_timestamp))
7258                 fprintf(f,
7259                         "%sExit Timestamp: %s\n"
7260                         "%sExit Code: %s\n"
7261                         "%sExit Status: %i\n",
7262                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7263                         prefix, sigchld_code_to_string(s->code),
7264                         prefix, s->status);
7265 }
7266
7267 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7268         _cleanup_free_ char *cmd = NULL;
7269         const char *prefix2;
7270
7271         assert(c);
7272         assert(f);
7273
7274         prefix = strempty(prefix);
7275         prefix2 = strjoina(prefix, "\t");
7276
7277         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7278
7279         fprintf(f,
7280                 "%sCommand Line: %s\n",
7281                 prefix, strnull(cmd));
7282
7283         exec_status_dump(&c->exec_status, f, prefix2);
7284 }
7285
7286 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7287         assert(f);
7288
7289         prefix = strempty(prefix);
7290
7291         LIST_FOREACH(command, i, c)
7292                 exec_command_dump(i, f, prefix);
7293 }
7294
7295 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7296         ExecCommand *end;
7297
7298         assert(l);
7299         assert(e);
7300
7301         if (*l) {
7302                 /* It's kind of important, that we keep the order here */
7303                 end = LIST_FIND_TAIL(command, *l);
7304                 LIST_INSERT_AFTER(command, *l, end, e);
7305         } else
7306                 *l = e;
7307 }
7308
7309 int exec_command_set(ExecCommand *c, const char *path, ...) {
7310         va_list ap;
7311         char **l, *p;
7312
7313         assert(c);
7314         assert(path);
7315
7316         va_start(ap, path);
7317         l = strv_new_ap(path, ap);
7318         va_end(ap);
7319
7320         if (!l)
7321                 return -ENOMEM;
7322
7323         p = strdup(path);
7324         if (!p) {
7325                 strv_free(l);
7326                 return -ENOMEM;
7327         }
7328
7329         free_and_replace(c->path, p);
7330
7331         return strv_free_and_replace(c->argv, l);
7332 }
7333
7334 int exec_command_append(ExecCommand *c, const char *path, ...) {
7335         _cleanup_strv_free_ char **l = NULL;
7336         va_list ap;
7337         int r;
7338
7339         assert(c);
7340         assert(path);
7341
7342         va_start(ap, path);
7343         l = strv_new_ap(path, ap);
7344         va_end(ap);
7345
7346         if (!l)
7347                 return -ENOMEM;
7348
7349         r = strv_extend_strv(&c->argv, l, false);
7350         if (r < 0)
7351                 return r;
7352
7353         return 0;
7354 }
7355
7356 static char *destroy_tree(char *path) {
7357         if (!path)
7358                 return NULL;
7359
7360         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7361                 log_debug("Spawning process to nuke '%s'", path);
7362
7363                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7364         }
7365
7366         return mfree(path);
7367 }
7368
7369 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7370         if (!rt)
7371                 return NULL;
7372
7373         if (rt->manager)
7374                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7375
7376         rt->id = mfree(rt->id);
7377         rt->tmp_dir = mfree(rt->tmp_dir);
7378         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7379         safe_close_pair(rt->netns_storage_socket);
7380         safe_close_pair(rt->ipcns_storage_socket);
7381         return mfree(rt);
7382 }
7383
/* Presumably generates exec_shared_runtime_unref() (called from exec_runtime_free()) and
 * exec_shared_runtime_freep() (used with _cleanup_() in this file), both built on top of
 * exec_shared_runtime_free() — confirm against the macro definitions. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7386
7387 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7388         if (!rt)
7389                 return NULL;
7390
7391         assert(rt->n_ref > 0);
7392         rt->n_ref--;
7393
7394         if (rt->n_ref > 0)
7395                 return NULL;
7396
7397         rt->tmp_dir = destroy_tree(rt->tmp_dir);
7398         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7399
7400         return exec_shared_runtime_free(rt);
7401 }
7402
7403 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7404         _cleanup_free_ char *id_copy = NULL;
7405         ExecSharedRuntime *n;
7406
7407         assert(ret);
7408
7409         id_copy = strdup(id);
7410         if (!id_copy)
7411                 return -ENOMEM;
7412
7413         n = new(ExecSharedRuntime, 1);
7414         if (!n)
7415                 return -ENOMEM;
7416
7417         *n = (ExecSharedRuntime) {
7418                 .id = TAKE_PTR(id_copy),
7419                 .netns_storage_socket = PIPE_EBADF,
7420                 .ipcns_storage_socket = PIPE_EBADF,
7421         };
7422
7423         *ret = n;
7424         return 0;
7425 }
7426
7427 static int exec_shared_runtime_add(
7428                 Manager *m,
7429                 const char *id,
7430                 char **tmp_dir,
7431                 char **var_tmp_dir,
7432                 int netns_storage_socket[2],
7433                 int ipcns_storage_socket[2],
7434                 ExecSharedRuntime **ret) {
7435
7436         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7437         int r;
7438
7439         assert(m);
7440         assert(id);
7441
7442         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7443
7444         r = exec_shared_runtime_allocate(&rt, id);
7445         if (r < 0)
7446                 return r;
7447
7448         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7449         if (r < 0)
7450                 return r;
7451
7452         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7453         rt->tmp_dir = TAKE_PTR(*tmp_dir);
7454         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7455
7456         if (netns_storage_socket) {
7457                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7458                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7459         }
7460
7461         if (ipcns_storage_socket) {
7462                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7463                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7464         }
7465
7466         rt->manager = m;
7467
7468         if (ret)
7469                 *ret = rt;
7470         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7471         TAKE_PTR(rt);
7472         return 0;
7473 }
7474
7475 static int exec_shared_runtime_make(
7476                 Manager *m,
7477                 const ExecContext *c,
7478                 const char *id,
7479                 ExecSharedRuntime **ret) {
7480
7481         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7482         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7483         int r;
7484
7485         assert(m);
7486         assert(c);
7487         assert(id);
7488
7489         /* It is not necessary to create ExecSharedRuntime object. */
7490         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7491                 *ret = NULL;
7492                 return 0;
7493         }
7494
7495         if (c->private_tmp &&
7496             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7497               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7498                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7499                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7500                 if (r < 0)
7501                         return r;
7502         }
7503
7504         if (exec_needs_network_namespace(c)) {
7505                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7506                         return -errno;
7507         }
7508
7509         if (exec_needs_ipc_namespace(c)) {
7510                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7511                         return -errno;
7512         }
7513
7514         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7515         if (r < 0)
7516                 return r;
7517
7518         return 1;
7519 }
7520
7521 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7522         ExecSharedRuntime *rt;
7523         int r;
7524
7525         assert(m);
7526         assert(id);
7527         assert(ret);
7528
7529         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7530         if (rt)
7531                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7532                 goto ref;
7533
7534         if (!create) {
7535                 *ret = NULL;
7536                 return 0;
7537         }
7538
7539         /* If not found, then create a new object. */
7540         r = exec_shared_runtime_make(m, c, id, &rt);
7541         if (r < 0)
7542                 return r;
7543         if (r == 0) {
7544                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7545                 *ret = NULL;
7546                 return 0;
7547         }
7548
7549 ref:
7550         /* increment reference counter. */
7551         rt->n_ref++;
7552         *ret = rt;
7553         return 1;
7554 }
7555
7556 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7557         ExecSharedRuntime *rt;
7558
7559         assert(m);
7560         assert(f);
7561         assert(fds);
7562
7563         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7564                 fprintf(f, "exec-runtime=%s", rt->id);
7565
7566                 if (rt->tmp_dir)
7567                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7568
7569                 if (rt->var_tmp_dir)
7570                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7571
7572                 if (rt->netns_storage_socket[0] >= 0) {
7573                         int copy;
7574
7575                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7576                         if (copy < 0)
7577                                 return copy;
7578
7579                         fprintf(f, " netns-socket-0=%i", copy);
7580                 }
7581
7582                 if (rt->netns_storage_socket[1] >= 0) {
7583                         int copy;
7584
7585                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7586                         if (copy < 0)
7587                                 return copy;
7588
7589                         fprintf(f, " netns-socket-1=%i", copy);
7590                 }
7591
7592                 if (rt->ipcns_storage_socket[0] >= 0) {
7593                         int copy;
7594
7595                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7596                         if (copy < 0)
7597                                 return copy;
7598
7599                         fprintf(f, " ipcns-socket-0=%i", copy);
7600                 }
7601
7602                 if (rt->ipcns_storage_socket[1] >= 0) {
7603                         int copy;
7604
7605                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7606                         if (copy < 0)
7607                                 return copy;
7608
7609                         fprintf(f, " ipcns-socket-1=%i", copy);
7610                 }
7611
7612                 fputc('\n', f);
7613         }
7614
7615         return 0;
7616 }
7617
7618 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7619         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7620         ExecSharedRuntime *rt;
7621         int r;
7622
7623         /* This is for the migration from old (v237 or earlier) deserialization text.
7624          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7625          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7626          * so or not from the serialized text, then we always creates a new object owned by this. */
7627
7628         assert(u);
7629         assert(key);
7630         assert(value);
7631
7632         /* Manager manages ExecSharedRuntime objects by the unit id.
7633          * So, we omit the serialized text when the unit does not have id (yet?)... */
7634         if (isempty(u->id)) {
7635                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7636                 return 0;
7637         }
7638
7639         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7640                 return log_oom();
7641
7642         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7643         if (!rt) {
7644                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7645                         return log_oom();
7646
7647                 rt = rt_create;
7648         }
7649
7650         if (streq(key, "tmp-dir")) {
7651                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7652                         return -ENOMEM;
7653
7654         } else if (streq(key, "var-tmp-dir")) {
7655                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7656                         return -ENOMEM;
7657
7658         } else if (streq(key, "netns-socket-0")) {
7659                 int fd;
7660
7661                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7662                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7663                         return 0;
7664                 }
7665
7666                 safe_close(rt->netns_storage_socket[0]);
7667                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7668
7669         } else if (streq(key, "netns-socket-1")) {
7670                 int fd;
7671
7672                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7673                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7674                         return 0;
7675                 }
7676
7677                 safe_close(rt->netns_storage_socket[1]);
7678                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7679
7680         } else
7681                 return 0;
7682
7683         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7684         if (rt_create) {
7685                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7686                 if (r < 0) {
7687                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7688                         return 0;
7689                 }
7690
7691                 rt_create->manager = u->manager;
7692
7693                 /* Avoid cleanup */
7694                 TAKE_PTR(rt_create);
7695         }
7696
7697         return 1;
7698 }
7699
7700 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7701         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7702         char *id = NULL;
7703         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7704         const char *p, *v = ASSERT_PTR(value);
7705         size_t n;
7706
7707         assert(m);
7708         assert(fds);
7709
7710         n = strcspn(v, " ");
7711         id = strndupa_safe(v, n);
7712         if (v[n] != ' ')
7713                 goto finalize;
7714         p = v + n + 1;
7715
7716         v = startswith(p, "tmp-dir=");
7717         if (v) {
7718                 n = strcspn(v, " ");
7719                 tmp_dir = strndup(v, n);
7720                 if (!tmp_dir)
7721                         return log_oom();
7722                 if (v[n] != ' ')
7723                         goto finalize;
7724                 p = v + n + 1;
7725         }
7726
7727         v = startswith(p, "var-tmp-dir=");
7728         if (v) {
7729                 n = strcspn(v, " ");
7730                 var_tmp_dir = strndup(v, n);
7731                 if (!var_tmp_dir)
7732                         return log_oom();
7733                 if (v[n] != ' ')
7734                         goto finalize;
7735                 p = v + n + 1;
7736         }
7737
7738         v = startswith(p, "netns-socket-0=");
7739         if (v) {
7740                 char *buf;
7741
7742                 n = strcspn(v, " ");
7743                 buf = strndupa_safe(v, n);
7744
7745                 netns_fdpair[0] = parse_fd(buf);
7746                 if (netns_fdpair[0] < 0)
7747                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7748                 if (!fdset_contains(fds, netns_fdpair[0]))
7749                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7750                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7751                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7752                 if (v[n] != ' ')
7753                         goto finalize;
7754                 p = v + n + 1;
7755         }
7756
7757         v = startswith(p, "netns-socket-1=");
7758         if (v) {
7759                 char *buf;
7760
7761                 n = strcspn(v, " ");
7762                 buf = strndupa_safe(v, n);
7763
7764                 netns_fdpair[1] = parse_fd(buf);
7765                 if (netns_fdpair[1] < 0)
7766                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7767                 if (!fdset_contains(fds, netns_fdpair[1]))
7768                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7769                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7770                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7771                 if (v[n] != ' ')
7772                         goto finalize;
7773                 p = v + n + 1;
7774         }
7775
7776         v = startswith(p, "ipcns-socket-0=");
7777         if (v) {
7778                 char *buf;
7779
7780                 n = strcspn(v, " ");
7781                 buf = strndupa_safe(v, n);
7782
7783                 ipcns_fdpair[0] = parse_fd(buf);
7784                 if (ipcns_fdpair[0] < 0)
7785                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7786                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7787                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7788                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7789                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7790                 if (v[n] != ' ')
7791                         goto finalize;
7792                 p = v + n + 1;
7793         }
7794
7795         v = startswith(p, "ipcns-socket-1=");
7796         if (v) {
7797                 char *buf;
7798
7799                 n = strcspn(v, " ");
7800                 buf = strndupa_safe(v, n);
7801
7802                 ipcns_fdpair[1] = parse_fd(buf);
7803                 if (ipcns_fdpair[1] < 0)
7804                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7805                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7806                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7807                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7808                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7809         }
7810
7811 finalize:
7812         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7813         if (r < 0)
7814                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7815         return 0;
7816 }
7817
7818 void exec_shared_runtime_vacuum(Manager *m) {
7819         ExecSharedRuntime *rt;
7820
7821         assert(m);
7822
7823         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7824
7825         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7826                 if (rt->n_ref > 0)
7827                         continue;
7828
7829                 (void) exec_shared_runtime_free(rt);
7830         }
7831 }
7832
7833 int exec_runtime_make(
7834                 const Unit *unit,
7835                 const ExecContext *context,
7836                 ExecSharedRuntime *shared,
7837                 DynamicCreds *creds,
7838                 ExecRuntime **ret) {
7839         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7840         _cleanup_free_ char *ephemeral = NULL;
7841         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7842         int r;
7843
7844         assert(unit);
7845         assert(context);
7846         assert(ret);
7847
7848         if (!shared && !creds && !exec_needs_ephemeral(context)) {
7849                 *ret = NULL;
7850                 return 0;
7851         }
7852
7853         if (exec_needs_ephemeral(context)) {
7854                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7855                 if (r < 0)
7856                         return r;
7857
7858                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7859                 if (r < 0)
7860                         return r;
7861
7862                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7863                         return -errno;
7864         }
7865
7866         rt = new(ExecRuntime, 1);
7867         if (!rt)
7868                 return -ENOMEM;
7869
7870         *rt = (ExecRuntime) {
7871                 .shared = shared,
7872                 .dynamic_creds = creds,
7873                 .ephemeral_copy = TAKE_PTR(ephemeral),
7874                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7875                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7876         };
7877
7878         *ret = TAKE_PTR(rt);
7879         return 1;
7880 }
7881
7882 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7883         if (!rt)
7884                 return NULL;
7885
7886         exec_shared_runtime_unref(rt->shared);
7887         dynamic_creds_unref(rt->dynamic_creds);
7888
7889         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7890
7891         safe_close_pair(rt->ephemeral_storage_socket);
7892         return mfree(rt);
7893 }
7894
7895 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7896         if (!rt)
7897                 return NULL;
7898
7899         rt->shared = exec_shared_runtime_destroy(rt->shared);
7900         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7901         return exec_runtime_free(rt);
7902 }
7903
7904 void exec_params_clear(ExecParameters *p) {
7905         if (!p)
7906                 return;
7907
7908         p->environment = strv_free(p->environment);
7909         p->fd_names = strv_free(p->fd_names);
7910         p->fds = mfree(p->fds);
7911         p->exec_fd = safe_close(p->exec_fd);
7912 }
7913
7914 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7915         if (!sc)
7916                 return NULL;
7917
7918         free(sc->id);
7919         free(sc->data);
7920         return mfree(sc);
7921 }
7922
7923 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7924         if (!lc)
7925                 return NULL;
7926
7927         free(lc->id);
7928         free(lc->path);
7929         return mfree(lc);
7930 }
7931
7932 void exec_directory_done(ExecDirectory *d) {
7933         if (!d)
7934                 return;
7935
7936         for (size_t i = 0; i < d->n_items; i++) {
7937                 free(d->items[i].path);
7938                 strv_free(d->items[i].symlinks);
7939         }
7940
7941         d->items = mfree(d->items);
7942         d->n_items = 0;
7943         d->mode = 0755;
7944 }
7945
7946 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7947         assert(d);
7948         assert(path);
7949
7950         for (size_t i = 0; i < d->n_items; i++)
7951                 if (path_equal(d->items[i].path, path))
7952                         return &d->items[i];
7953
7954         return NULL;
7955 }
7956
7957 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7958         _cleanup_strv_free_ char **s = NULL;
7959         _cleanup_free_ char *p = NULL;
7960         ExecDirectoryItem *existing;
7961         int r;
7962
7963         assert(d);
7964         assert(path);
7965
7966         existing = exec_directory_find(d, path);
7967         if (existing) {
7968                 r = strv_extend(&existing->symlinks, symlink);
7969                 if (r < 0)
7970                         return r;
7971
7972                 return 0; /* existing item is updated */
7973         }
7974
7975         p = strdup(path);
7976         if (!p)
7977                 return -ENOMEM;
7978
7979         if (symlink) {
7980                 s = strv_new(symlink);
7981                 if (!s)
7982                         return -ENOMEM;
7983         }
7984
7985         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7986                 return -ENOMEM;
7987
7988         d->items[d->n_items++] = (ExecDirectoryItem) {
7989                 .path = TAKE_PTR(p),
7990                 .symlinks = TAKE_PTR(s),
7991         };
7992
7993         return 1; /* new item is added */
7994 }
7995
7996 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7997         assert(a);
7998         assert(b);
7999
8000         return path_compare(a->path, b->path);
8001 }
8002
8003 void exec_directory_sort(ExecDirectory *d) {
8004         assert(d);
8005
8006         /* Sort the exec directories to make always parent directories processed at first in
8007          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8008          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8009          * list. See also comments in setup_exec_directory() and issue #24783. */
8010
8011         if (d->n_items <= 1)
8012                 return;
8013
8014         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8015
8016         for (size_t i = 1; i < d->n_items; i++)
8017                 for (size_t j = 0; j < i; j++)
8018                         if (path_startswith(d->items[i].path, d->items[j].path)) {
8019                                 d->items[i].only_create = true;
8020                                 break;
8021                         }
8022 }
8023
8024 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8025         ExecDirectoryType t;
8026
8027         assert(s);
8028
8029         if (streq(s, "all"))
8030                 return EXEC_CLEAN_ALL;
8031         if (streq(s, "fdstore"))
8032                 return EXEC_CLEAN_FDSTORE;
8033
8034         t = exec_resource_type_from_string(s);
8035         if (t < 0)
8036                 return (ExecCleanMask) t;
8037
8038         return 1U << t;
8039 }
8040
8041 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
8042 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8043
8044 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
8045         [EXEC_INPUT_NULL] = "null",
8046         [EXEC_INPUT_TTY] = "tty",
8047         [EXEC_INPUT_TTY_FORCE] = "tty-force",
8048         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
8049         [EXEC_INPUT_SOCKET] = "socket",
8050         [EXEC_INPUT_NAMED_FD] = "fd",
8051         [EXEC_INPUT_DATA] = "data",
8052         [EXEC_INPUT_FILE] = "file",
8053 };
8054
8055 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8056
8057 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8058         [EXEC_OUTPUT_INHERIT] = "inherit",
8059         [EXEC_OUTPUT_NULL] = "null",
8060         [EXEC_OUTPUT_TTY] = "tty",
8061         [EXEC_OUTPUT_KMSG] = "kmsg",
8062         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8063         [EXEC_OUTPUT_JOURNAL] = "journal",
8064         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8065         [EXEC_OUTPUT_SOCKET] = "socket",
8066         [EXEC_OUTPUT_NAMED_FD] = "fd",
8067         [EXEC_OUTPUT_FILE] = "file",
8068         [EXEC_OUTPUT_FILE_APPEND] = "append",
8069         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8070 };
8071
8072 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8073
8074 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8075         [EXEC_UTMP_INIT] = "init",
8076         [EXEC_UTMP_LOGIN] = "login",
8077         [EXEC_UTMP_USER] = "user",
8078 };
8079
8080 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8081
8082 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8083         [EXEC_PRESERVE_NO] = "no",
8084         [EXEC_PRESERVE_YES] = "yes",
8085         [EXEC_PRESERVE_RESTART] = "restart",
8086 };
8087
8088 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8089
8090 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8091 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8092         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8093         [EXEC_DIRECTORY_STATE] = "StateDirectory",
8094         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8095         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8096         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8097 };
8098
8099 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8100
/* Maps each ExecDirectoryType to the name of the unit setting that configures symlinks for that directory
 * type. The entries are the names from the plain directory setting table with a "Symlink" suffix. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

/* Emits the string lookup helpers for exec_directory_type_symlink_table */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8111
8112 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8113  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8114  * directories, specifically .timer units with their timestamp touch file. */
8115 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8116         [EXEC_DIRECTORY_RUNTIME] = "runtime",
8117         [EXEC_DIRECTORY_STATE] = "state",
8118         [EXEC_DIRECTORY_CACHE] = "cache",
8119         [EXEC_DIRECTORY_LOGS] = "logs",
8120         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8121 };
8122
8123 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8124
8125 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8126  * the service payload in. */
8127 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8128         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8129         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8130         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8131         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8132         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8133 };
8134
8135 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8136
8137 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8138         [EXEC_KEYRING_INHERIT] = "inherit",
8139         [EXEC_KEYRING_PRIVATE] = "private",
8140         [EXEC_KEYRING_SHARED] = "shared",
8141 };
8142
8143 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);