src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_SECCOMP
  29 #include <seccomp.h>
  30 #endif
  31
  32 #if HAVE_APPARMOR
  33 #include <sys/apparmor.h>
  34 #endif
  35
  36 #include "sd-messages.h"
  37
  38 #include "acl-util.h"
  39 #include "af-list.h"
  40 #include "alloc-util.h"
  41 #if HAVE_APPARMOR
  42 #include "apparmor-util.h"
  43 #endif
  44 #include "argv-util.h"
  45 #include "async.h"
  46 #include "barrier.h"
  47 #include "bpf-lsm.h"
  48 #include "btrfs-util.h"
  49 #include "cap-list.h"
  50 #include "capability-util.h"
  51 #include "chattr-util.h"
  52 #include "cgroup-setup.h"
  53 #include "chase.h"
  54 #include "chown-recursive.h"
  55 #include "constants.h"
  56 #include "cpu-set-util.h"
  57 #include "creds-util.h"
  58 #include "data-fd-util.h"
  59 #include "env-file.h"
  60 #include "env-util.h"
  61 #include "errno-list.h"
  62 #include "escape.h"
  63 #include "execute.h"
  64 #include "exit-status.h"
  65 #include "fd-util.h"
  66 #include "fileio.h"
  67 #include "format-util.h"
  68 #include "glob-util.h"
  69 #include "hexdecoct.h"
  70 #include "io-util.h"
  71 #include "ioprio-util.h"
  72 #include "label-util.h"
  73 #include "lock-util.h"
  74 #include "log.h"
  75 #include "macro.h"
  76 #include "manager.h"
  77 #include "manager-dump.h"
  78 #include "memory-util.h"
  79 #include "missing_fs.h"
  80 #include "missing_ioprio.h"
  81 #include "missing_prctl.h"
  82 #include "mkdir-label.h"
  83 #include "mount-util.h"
  84 #include "mountpoint-util.h"
  85 #include "namespace.h"
  86 #include "parse-util.h"
  87 #include "path-util.h"
  88 #include "proc-cmdline.h"
  89 #include "process-util.h"
  90 #include "psi-util.h"
  91 #include "random-util.h"
  92 #include "recurse-dir.h"
  93 #include "rlimit-util.h"
  94 #include "rm-rf.h"
  95 #if HAVE_SECCOMP
  96 #include "seccomp-util.h"
  97 #endif
  98 #include "securebits-util.h"
  99 #include "selinux-util.h"
 100 #include "signal-util.h"
 101 #include "smack-util.h"
 102 #include "socket-util.h"
 103 #include "sort-util.h"
 104 #include "special.h"
 105 #include "stat-util.h"
 106 #include "string-table.h"
 107 #include "string-util.h"
 108 #include "strv.h"
 109 #include "syslog-util.h"
 110 #include "terminal-util.h"
 111 #include "tmpfile-util.h"
 112 #include "umask-util.h"
 113 #include "unit-serialize.h"
 114 #include "user-util.h"
 115 #include "utmp-wtmp.h"
 116
/* Timeouts, expressed in microseconds (multiples of USEC_PER_SEC).
 * NOTE(review): the code using these two constants is not part of this section of the file — presumably
 * they bound waiting on the manager's idle pipe; confirm at the call sites. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) that connect_logger_as() requests via fd_inc_sndbuf() for the stream socket it
 * connects to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
 121
 122 static int shift_fds(int fds[], size_t n_fds) {
 123         if (n_fds <= 0)
 124                 return 0;
 125
 126         /* Modifies the fds array! (sorts it) */
 127
 128         assert(fds);
 129
 130         for (int start = 0;;) {
 131                 int restart_from = -1;
 132
 133                 for (int i = start; i < (int) n_fds; i++) {
 134                         int nfd;
 135
 136                         /* Already at right index? */
 137                         if (fds[i] == i+3)
 138                                 continue;
 139
 140                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 141                         if (nfd < 0)
 142                                 return -errno;
 143
 144                         safe_close(fds[i]);
 145                         fds[i] = nfd;
 146
 147                         /* Hmm, the fd we wanted isn't free? Then
 148                          * let's remember that and try again from here */
 149                         if (nfd != i+3 && restart_from < 0)
 150                                 restart_from = i;
 151                 }
 152
 153                 if (restart_from < 0)
 154                         break;
 155
 156                 start = restart_from;
 157         }
 158
 159         return 0;
 160 }
 161
 162 static int flags_fds(
 163                 const int fds[],
 164                 size_t n_socket_fds,
 165                 size_t n_fds,
 166                 bool nonblock) {
 167
 168         int r;
 169
 170         if (n_fds <= 0)
 171                 return 0;
 172
 173         assert(fds);
 174
 175         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 176          * O_NONBLOCK only applies to socket activation though. */
 177
 178         for (size_t i = 0; i < n_fds; i++) {
 179
 180                 if (i < n_socket_fds) {
 181                         r = fd_nonblock(fds[i], nonblock);
 182                         if (r < 0)
 183                                 return r;
 184                 }
 185
 186                 /* We unconditionally drop FD_CLOEXEC from the fds,
 187                  * since after all we want to pass these fds to our
 188                  * children */
 189
 190                 r = fd_cloexec(fds[i], false);
 191                 if (r < 0)
 192                         return r;
 193         }
 194
 195         return 0;
 196 }
 197
 198 static const char *exec_context_tty_path(const ExecContext *context) {
 199         assert(context);
 200
 201         if (context->stdio_as_fds)
 202                 return NULL;
 203
 204         if (context->tty_path)
 205                 return context->tty_path;
 206
 207         return "/dev/console";
 208 }
 209
 210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 211         unsigned rows, cols;
 212         const char *tty;
 213
 214         assert(context);
 215         assert(ret_rows);
 216         assert(ret_cols);
 217
 218         rows = context->tty_rows;
 219         cols = context->tty_cols;
 220
 221         tty = exec_context_tty_path(context);
 222         if (tty)
 223                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 224
 225         *ret_rows = rows;
 226         *ret_cols = cols;
 227
 228         return 0;
 229 }
 230
 231 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 232         _cleanup_close_ int fd = -EBADF;
 233         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 234
 235         /* Take a lock around the device for the duration of the setup that we do here.
 236          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 237          * We open a new fd that will be closed automatically, and operate on it for convenience.
 238          */
 239
 240         if (p && p->stdin_fd >= 0) {
 241                 fd = xopenat_lock(p->stdin_fd, NULL,
 242                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 243                 if (fd < 0)
 244                         return;
 245         } else if (path) {
 246                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 247                 if (fd < 0)
 248                         return;
 249
 250                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 251                         return;
 252         } else
 253                 return;   /* nothing to do */
 254
 255         if (context->tty_vhangup)
 256                 (void) terminal_vhangup_fd(fd);
 257
 258         if (context->tty_reset)
 259                 (void) reset_terminal_fd(fd, true);
 260
 261         if (p && p->stdin_fd >= 0) {
 262                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 263
 264                 (void) exec_context_tty_size(context, &rows, &cols);
 265                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 266         }
 267
 268         if (context->tty_vt_disallocate && path)
 269                 (void) vt_disallocate(path);
 270 }
 271
 272 static bool is_terminal_input(ExecInput i) {
 273         return IN_SET(i,
 274                       EXEC_INPUT_TTY,
 275                       EXEC_INPUT_TTY_FORCE,
 276                       EXEC_INPUT_TTY_FAIL);
 277 }
 278
 279 static bool is_terminal_output(ExecOutput o) {
 280         return IN_SET(o,
 281                       EXEC_OUTPUT_TTY,
 282                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 283                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 284 }
 285
 286 static bool is_kmsg_output(ExecOutput o) {
 287         return IN_SET(o,
 288                       EXEC_OUTPUT_KMSG,
 289                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 290 }
 291
 292 static bool exec_context_needs_term(const ExecContext *c) {
 293         assert(c);
 294
 295         /* Return true if the execution context suggests we should set $TERM to something useful. */
 296
 297         if (is_terminal_input(c->std_input))
 298                 return true;
 299
 300         if (is_terminal_output(c->std_output))
 301                 return true;
 302
 303         if (is_terminal_output(c->std_error))
 304                 return true;
 305
 306         return !!c->tty_path;
 307 }
 308
 309 static int open_null_as(int flags, int nfd) {
 310         int fd;
 311
 312         assert(nfd >= 0);
 313
 314         fd = open("/dev/null", flags|O_NOCTTY);
 315         if (fd < 0)
 316                 return -errno;
 317
 318         return move_fd(fd, nfd, false);
 319 }
 320
 321 static int connect_journal_socket(
 322                 int fd,
 323                 const char *log_namespace,
 324                 uid_t uid,
 325                 gid_t gid) {
 326
 327         uid_t olduid = UID_INVALID;
 328         gid_t oldgid = GID_INVALID;
 329         const char *j;
 330         int r;
 331
 332         j = log_namespace ?
 333                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 334                 "/run/systemd/journal/stdout";
 335
 336         if (gid_is_valid(gid)) {
 337                 oldgid = getgid();
 338
 339                 if (setegid(gid) < 0)
 340                         return -errno;
 341         }
 342
 343         if (uid_is_valid(uid)) {
 344                 olduid = getuid();
 345
 346                 if (seteuid(uid) < 0) {
 347                         r = -errno;
 348                         goto restore_gid;
 349                 }
 350         }
 351
 352         r = connect_unix_path(fd, AT_FDCWD, j);
 353
 354         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 355            an LSM interferes. */
 356
 357         if (uid_is_valid(uid))
 358                 (void) seteuid(olduid);
 359
 360  restore_gid:
 361         if (gid_is_valid(gid))
 362                 (void) setegid(oldgid);
 363
 364         return r;
 365 }
 366
 367 static int connect_logger_as(
 368                 const Unit *unit,
 369                 const ExecContext *context,
 370                 const ExecParameters *params,
 371                 ExecOutput output,
 372                 const char *ident,
 373                 int nfd,
 374                 uid_t uid,
 375                 gid_t gid) {
 376
 377         _cleanup_close_ int fd = -EBADF;
 378         int r;
 379
 380         assert(context);
 381         assert(params);
 382         assert(output < _EXEC_OUTPUT_MAX);
 383         assert(ident);
 384         assert(nfd >= 0);
 385
 386         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 387         if (fd < 0)
 388                 return -errno;
 389
 390         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 391         if (r < 0)
 392                 return r;
 393
 394         if (shutdown(fd, SHUT_RD) < 0)
 395                 return -errno;
 396
 397         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 398
 399         if (dprintf(fd,
 400                 "%s\n"
 401                 "%s\n"
 402                 "%i\n"
 403                 "%i\n"
 404                 "%i\n"
 405                 "%i\n"
 406                 "%i\n",
 407                 context->syslog_identifier ?: ident,
 408                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 409                 context->syslog_priority,
 410                 !!context->syslog_level_prefix,
 411                 false,
 412                 is_kmsg_output(output),
 413                 is_terminal_output(output)) < 0)
 414                 return -errno;
 415
 416         return move_fd(TAKE_FD(fd), nfd, false);
 417 }
 418
 419 static int open_terminal_as(const char *path, int flags, int nfd) {
 420         int fd;
 421
 422         assert(path);
 423         assert(nfd >= 0);
 424
 425         fd = open_terminal(path, flags | O_NOCTTY);
 426         if (fd < 0)
 427                 return fd;
 428
 429         return move_fd(fd, nfd, false);
 430 }
 431
 432 static int acquire_path(const char *path, int flags, mode_t mode) {
 433         _cleanup_close_ int fd = -EBADF;
 434         int r;
 435
 436         assert(path);
 437
 438         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 439                 flags |= O_CREAT;
 440
 441         fd = open(path, flags|O_NOCTTY, mode);
 442         if (fd >= 0)
 443                 return TAKE_FD(fd);
 444
 445         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 446                 return -errno;
 447
 448         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 449
 450         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 451         if (fd < 0)
 452                 return -errno;
 453
 454         r = connect_unix_path(fd, AT_FDCWD, path);
 455         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 456                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 457                  * wasn't an AF_UNIX socket after all */
 458                 return -ENXIO;
 459         if (r < 0)
 460                 return r;
 461
 462         if ((flags & O_ACCMODE) == O_RDONLY)
 463                 r = shutdown(fd, SHUT_WR);
 464         else if ((flags & O_ACCMODE) == O_WRONLY)
 465                 r = shutdown(fd, SHUT_RD);
 466         else
 467                 r = 0;
 468         if (r < 0)
 469                 return -errno;
 470
 471         return TAKE_FD(fd);
 472 }
 473
 474 static int fixup_input(
 475                 const ExecContext *context,
 476                 int socket_fd,
 477                 bool apply_tty_stdin) {
 478
 479         ExecInput std_input;
 480
 481         assert(context);
 482
 483         std_input = context->std_input;
 484
 485         if (is_terminal_input(std_input) && !apply_tty_stdin)
 486                 return EXEC_INPUT_NULL;
 487
 488         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 489                 return EXEC_INPUT_NULL;
 490
 491         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 492                 return EXEC_INPUT_NULL;
 493
 494         return std_input;
 495 }
 496
 497 static int fixup_output(ExecOutput output, int socket_fd) {
 498
 499         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 500                 return EXEC_OUTPUT_INHERIT;
 501
 502         return output;
 503 }
 504
 505 static int setup_input(
 506                 const ExecContext *context,
 507                 const ExecParameters *params,
 508                 int socket_fd,
 509                 const int named_iofds[static 3]) {
 510
 511         ExecInput i;
 512         int r;
 513
 514         assert(context);
 515         assert(params);
 516         assert(named_iofds);
 517
 518         if (params->stdin_fd >= 0) {
 519                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 520                         return -errno;
 521
 522                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 523                 if (isatty(STDIN_FILENO)) {
 524                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 525
 526                         (void) exec_context_tty_size(context, &rows, &cols);
 527                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 528                         (void) reset_terminal_fd(STDIN_FILENO, true);
 529                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 530                 }
 531
 532                 return STDIN_FILENO;
 533         }
 534
 535         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 536
 537         switch (i) {
 538
 539         case EXEC_INPUT_NULL:
 540                 return open_null_as(O_RDONLY, STDIN_FILENO);
 541
 542         case EXEC_INPUT_TTY:
 543         case EXEC_INPUT_TTY_FORCE:
 544         case EXEC_INPUT_TTY_FAIL: {
 545                 unsigned rows, cols;
 546                 int fd;
 547
 548                 fd = acquire_terminal(exec_context_tty_path(context),
 549                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 550                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 551                                                                   ACQUIRE_TERMINAL_WAIT,
 552                                       USEC_INFINITY);
 553                 if (fd < 0)
 554                         return fd;
 555
 556                 r = exec_context_tty_size(context, &rows, &cols);
 557                 if (r < 0)
 558                         return r;
 559
 560                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 561                 if (r < 0)
 562                         return r;
 563
 564                 return move_fd(fd, STDIN_FILENO, false);
 565         }
 566
 567         case EXEC_INPUT_SOCKET:
 568                 assert(socket_fd >= 0);
 569
 570                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 571
 572         case EXEC_INPUT_NAMED_FD:
 573                 assert(named_iofds[STDIN_FILENO] >= 0);
 574
 575                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 576                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 577
 578         case EXEC_INPUT_DATA: {
 579                 int fd;
 580
 581                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 582                 if (fd < 0)
 583                         return fd;
 584
 585                 return move_fd(fd, STDIN_FILENO, false);
 586         }
 587
 588         case EXEC_INPUT_FILE: {
 589                 bool rw;
 590                 int fd;
 591
 592                 assert(context->stdio_file[STDIN_FILENO]);
 593
 594                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 595                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 596
 597                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 598                 if (fd < 0)
 599                         return fd;
 600
 601                 return move_fd(fd, STDIN_FILENO, false);
 602         }
 603
 604         default:
 605                 assert_not_reached();
 606         }
 607 }
 608
 609 static bool can_inherit_stderr_from_stdout(
 610                 const ExecContext *context,
 611                 ExecOutput o,
 612                 ExecOutput e) {
 613
 614         assert(context);
 615
 616         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 617          * stderr fd */
 618
 619         if (e == EXEC_OUTPUT_INHERIT)
 620                 return true;
 621         if (e != o)
 622                 return false;
 623
 624         if (e == EXEC_OUTPUT_NAMED_FD)
 625                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 626
 627         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 628                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 629
 630         return true;
 631 }
 632
/* Sets up standard output or standard error (selected by 'fileno', STDOUT_FILENO or STDERR_FILENO) of the
 * process according to the execution context.
 *
 * An fd passed explicitly in params->stdout_fd/params->stderr_fd takes precedence and is simply
 * duplicated onto 'fileno'. Otherwise the effective input/output modes are computed with
 * fixup_input()/fixup_output() and the fd is set up accordingly. This relies on stdin — and, when setting
 * up stderr, stdout — already having been set up, since several paths duplicate those fds.
 *
 * When the fd gets connected to the journal via a stream, the device and inode number of that stream are
 * stored in *journal_stream_dev/*journal_stream_ino (stderr overrides a value recorded for stdout). If
 * connecting to the journal fails, a warning is logged and /dev/null is used instead.
 *
 * Returns a non-negative value on success, a negative errno-style value on failure. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds win over anything configured in the context. */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid() != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));

                /* From here on stderr is set up on its own, using its own mode. */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin already is the terminal, share that fd. */
                if (is_terminal_input(i))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: fall back to /dev/null so that the fd is at least valid. */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
                                               fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return RET_NERRNO(dup2(socket_fd, fileno));

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Switching to blocking mode is best-effort. */
                (void) fd_nonblock(named_iofds[fileno], false);
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* Same file as stdin? Then just share the fd that was set up for stdin. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
                        flags |= O_TRUNC;

                /* Files are created with mode 0666, masked by the context's umask. */
                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached();
        }
}
 794
 795 static int chown_terminal(int fd, uid_t uid) {
 796         int r;
 797
 798         assert(fd >= 0);
 799
 800         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 801         if (isatty(fd) < 1) {
 802                 if (IN_SET(errno, EINVAL, ENOTTY))
 803                         return 0; /* not a tty */
 804
 805                 return -errno;
 806         }
 807
 808         /* This might fail. What matters are the results. */
 809         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 810         if (r < 0)
 811                 return r;
 812
 813         return 1;
 814 }
 815
 816 static int setup_confirm_stdio(
 817                 const ExecContext *context,
 818                 const char *vc,
 819                 int *ret_saved_stdin,
 820                 int *ret_saved_stdout) {
 821
 822         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 823         unsigned rows, cols;
 824         int r;
 825
 826         assert(ret_saved_stdin);
 827         assert(ret_saved_stdout);
 828
 829         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 830         if (saved_stdin < 0)
 831                 return -errno;
 832
 833         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 834         if (saved_stdout < 0)
 835                 return -errno;
 836
 837         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 838         if (fd < 0)
 839                 return fd;
 840
 841         r = chown_terminal(fd, getuid());
 842         if (r < 0)
 843                 return r;
 844
 845         r = reset_terminal_fd(fd, true);
 846         if (r < 0)
 847                 return r;
 848
 849         r = exec_context_tty_size(context, &rows, &cols);
 850         if (r < 0)
 851                 return r;
 852
 853         r = terminal_set_size_fd(fd, vc, rows, cols);
 854         if (r < 0)
 855                 return r;
 856
 857         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 858         TAKE_FD(fd);
 859         if (r < 0)
 860                 return r;
 861
 862         *ret_saved_stdin = TAKE_FD(saved_stdin);
 863         *ret_saved_stdout = TAKE_FD(saved_stdout);
 864         return 0;
 865 }
 866
 867 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 868         assert(err < 0);
 869
 870         if (err == -ETIMEDOUT)
 871                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 872         else {
 873                 errno = -err;
 874                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 875         }
 876 }
 877
 878 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 879         _cleanup_close_ int fd = -EBADF;
 880
 881         assert(vc);
 882
 883         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 884         if (fd < 0)
 885                 return;
 886
 887         write_confirm_error_fd(err, fd, u);
 888 }
 889
 890 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 891         int r = 0;
 892
 893         assert(saved_stdin);
 894         assert(saved_stdout);
 895
 896         release_terminal();
 897
 898         if (*saved_stdin >= 0)
 899                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 900                         r = -errno;
 901
 902         if (*saved_stdout >= 0)
 903                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 904                         r = -errno;
 905
 906         *saved_stdin = safe_close(*saved_stdin);
 907         *saved_stdout = safe_close(*saved_stdout);
 908
 909         return r;
 910 }
 911
/* Possible outcomes of ask_for_confirmation(). */
enum {
        /* Don't execute the command and pretend it failed (the "f" answer). */
        CONFIRM_PRETEND_FAILURE = -1,
        /* NOTE(review): presumably "don't execute, but pretend it succeeded"; the code returning this
         * value is outside the visible part of the file — confirm there. */
        CONFIRM_PRETEND_SUCCESS =  0,
        /* Go ahead and execute the command. Also used whenever asking fails for internal reasons. */
        CONFIRM_EXECUTE = 1,
};
 917
 918 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 919         int saved_stdout = -1, saved_stdin = -1, r;
 920         _cleanup_free_ char *e = NULL;
 921         char c;
 922
 923         /* For any internal errors, assume a positive response. */
 924         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 925         if (r < 0) {
 926                 write_confirm_error(r, vc, u);
 927                 return CONFIRM_EXECUTE;
 928         }
 929
 930         /* confirm_spawn might have been disabled while we were sleeping. */
 931         if (manager_is_confirm_spawn_disabled(u->manager)) {
 932                 r = 1;
 933                 goto restore_stdio;
 934         }
 935
 936         e = ellipsize(cmdline, 60, 100);
 937         if (!e) {
 938                 log_oom();
 939                 r = CONFIRM_EXECUTE;
 940                 goto restore_stdio;
 941         }
 942
 943         for (;;) {
 944                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 945                 if (r < 0) {
 946                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 947                         r = CONFIRM_EXECUTE;
 948                         goto restore_stdio;
 949                 }
 950
 951                 switch (c) {
 952                 case 'c':
 953                         printf("Resuming normal execution.\n");
 954                         manager_disable_confirm_spawn();
 955                         r = 1;
 956                         break;
 957                 case 'D':
 958                         unit_dump(u, stdout, "  ");
 959                         continue; /* ask again */
 960                 case 'f':
 961                         printf("Failing execution.\n");
 962                         r = CONFIRM_PRETEND_FAILURE;
 963                         break;
 964                 case 'h':
 965                         printf("  c - continue, proceed without asking anymore\n"
 966                                "  D - dump, show the state of the unit\n"
 967                                "  f - fail, don't execute the command and pretend it failed\n"
 968                                "  h - help\n"
 969                                "  i - info, show a short summary of the unit\n"
 970                                "  j - jobs, show jobs that are in progress\n"
 971                                "  s - skip, don't execute the command and pretend it succeeded\n"
 972                                "  y - yes, execute the command\n");
 973                         continue; /* ask again */
 974                 case 'i':
 975                         printf("  Description: %s\n"
 976                                "  Unit:        %s\n"
 977                                "  Command:     %s\n",
 978                                u->id, u->description, cmdline);
 979                         continue; /* ask again */
 980                 case 'j':
 981                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 982                         continue; /* ask again */
 983                 case 'n':
 984                         /* 'n' was removed in favor of 'f'. */
 985                         printf("Didn't understand 'n', did you mean 'f'?\n");
 986                         continue; /* ask again */
 987                 case 's':
 988                         printf("Skipping execution.\n");
 989                         r = CONFIRM_PRETEND_SUCCESS;
 990                         break;
 991                 case 'y':
 992                         r = CONFIRM_EXECUTE;
 993                         break;
 994                 default:
 995                         assert_not_reached();
 996                 }
 997                 break;
 998         }
 999
1000 restore_stdio:
1001         restore_confirm_stdio(&saved_stdin, &saved_stdout);
1002         return r;
1003 }
1004
1005 static int get_fixed_user(const ExecContext *c, const char **user,
1006                           uid_t *uid, gid_t *gid,
1007                           const char **home, const char **shell) {
1008         int r;
1009         const char *name;
1010
1011         assert(c);
1012
1013         if (!c->user)
1014                 return 0;
1015
1016         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1017          * (i.e. are "/" or "/bin/nologin"). */
1018
1019         name = c->user;
1020         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1021         if (r < 0)
1022                 return r;
1023
1024         *user = name;
1025         return 0;
1026 }
1027
1028 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1029         int r;
1030         const char *name;
1031
1032         assert(c);
1033
1034         if (!c->group)
1035                 return 0;
1036
1037         name = c->group;
1038         r = get_group_creds(&name, gid, 0);
1039         if (r < 0)
1040                 return r;
1041
1042         *group = name;
1043         return 0;
1044 }
1045
1046 static int get_supplementary_groups(const ExecContext *c, const char *user,
1047                                     const char *group, gid_t gid,
1048                                     gid_t **supplementary_gids, int *ngids) {
1049         int r, k = 0;
1050         int ngroups_max;
1051         bool keep_groups = false;
1052         gid_t *groups = NULL;
1053         _cleanup_free_ gid_t *l_gids = NULL;
1054
1055         assert(c);
1056
1057         /*
1058          * If user is given, then lookup GID and supplementary groups list.
1059          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1060          * here and as early as possible so we keep the list of supplementary
1061          * groups of the caller.
1062          */
1063         if (user && gid_is_valid(gid) && gid != 0) {
1064                 /* First step, initialize groups from /etc/groups */
1065                 if (initgroups(user, gid) < 0)
1066                         return -errno;
1067
1068                 keep_groups = true;
1069         }
1070
1071         if (strv_isempty(c->supplementary_groups))
1072                 return 0;
1073
1074         /*
1075          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1076          * be positive, otherwise fail.
1077          */
1078         errno = 0;
1079         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1080         if (ngroups_max <= 0)
1081                 return errno_or_else(EOPNOTSUPP);
1082
1083         l_gids = new(gid_t, ngroups_max);
1084         if (!l_gids)
1085                 return -ENOMEM;
1086
1087         if (keep_groups) {
1088                 /*
1089                  * Lookup the list of groups that the user belongs to, we
1090                  * avoid NSS lookups here too for gid=0.
1091                  */
1092                 k = ngroups_max;
1093                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1094                         return -EINVAL;
1095         } else
1096                 k = 0;
1097
1098         STRV_FOREACH(i, c->supplementary_groups) {
1099                 const char *g;
1100
1101                 if (k >= ngroups_max)
1102                         return -E2BIG;
1103
1104                 g = *i;
1105                 r = get_group_creds(&g, l_gids+k, 0);
1106                 if (r < 0)
1107                         return r;
1108
1109                 k++;
1110         }
1111
1112         /*
1113          * Sets ngids to zero to drop all supplementary groups, happens
1114          * when we are under root and SupplementaryGroups= is empty.
1115          */
1116         if (k == 0) {
1117                 *ngids = 0;
1118                 return 0;
1119         }
1120
1121         /* Otherwise get the final list of supplementary groups */
1122         groups = memdup(l_gids, sizeof(gid_t) * k);
1123         if (!groups)
1124                 return -ENOMEM;
1125
1126         *supplementary_gids = groups;
1127         *ngids = k;
1128
1129         groups = NULL;
1130
1131         return 0;
1132 }
1133
/* Applies the group identity of a service: installs the 'ngids' entries of 'supplementary_gids' (only
 * if there are any) and then sets the real, effective and saved GID to 'gid' (only if it is valid).
 *
 * Returns 0 on success, the error of maybe_setgroups(), or the negative errno of setresgid(). */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1152
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and a negative
 * errno if reading or writing them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        unsigned wanted = ((unsigned) before & ~mask) | bits;
        if (wanted != (unsigned) before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1171
1172 static int enforce_user(
1173                 const ExecContext *context,
1174                 uid_t uid,
1175                 uint64_t capability_ambient_set) {
1176         assert(context);
1177         int r;
1178
1179         if (!uid_is_valid(uid))
1180                 return 0;
1181
1182         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1183          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1184          * case. */
1185
1186         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1187
1188                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1189                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1190                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1191                 if (r < 0)
1192                         return r;
1193         }
1194
1195         /* Second step: actually set the uids */
1196         if (setresuid(uid, uid, uid) < 0)
1197                 return -errno;
1198
1199         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1200          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1201          * outside of this call. */
1202         return 0;
1203 }
1204
1205 #if HAVE_PAM
1206
1207 static int null_conv(
1208                 int num_msg,
1209                 const struct pam_message **msg,
1210                 struct pam_response **resp,
1211                 void *appdata_ptr) {
1212
1213         /* We don't support conversations */
1214
1215         return PAM_CONV_ERR;
1216 }
1217
1218 #endif
1219
/* Opens a PAM session for 'user' under the PAM service 'name' and forks off a helper process
 * ("(sd-pam)") that cleans the session up again later.
 *
 * name:       service name handed to pam_start()
 * user:       user name handed to pam_start()
 * uid, gid:   identity the helper process switches to (best effort) after dropping its groups
 * tty:        value for PAM_TTY; if NULL, the TTY behind stdin is used when one can be determined
 * env:        in: variables exported to PAM with pam_putenv(); out: replaced by the list obtained
 *             from pam_getenvlist() on success, left untouched on failure
 * fds, n_fds: file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success and a negative errno-style value on
 * failure (-EPERM for PAM-level errors). When built without PAM support this does nothing and
 * returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment prepared by the caller to the PAM modules. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* There is no child that relies on SIGTERM being blocked, hence undo the masking
                 * before bailing out, so that we don't return with a changed signal mask. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set), or a non-PAM step failed (r is set). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1436
1437 static void rename_process_from_path(const char *path) {
1438         _cleanup_free_ char *buf = NULL;
1439         const char *p;
1440
1441         assert(path);
1442
1443         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1444          * /bin/ps */
1445
1446         if (path_extract_filename(path, &buf) < 0) {
1447                 rename_process("(...)");
1448                 return;
1449         }
1450
1451         size_t l = strlen(buf);
1452         if (l > 8) {
1453                 /* The end of the process name is usually more interesting, since the first bit might just be
1454                  * "systemd-" */
1455                 p = buf + l - 8;
1456                 l = 8;
1457         } else
1458                 p = buf;
1459
1460         char process_name[11];
1461         process_name[0] = '(';
1462         memcpy(process_name+1, p, l);
1463         process_name[1+l] = ')';
1464         process_name[1+l+1] = 0;
1465
1466         rename_process(process_name);
1467 }
1468
1469 static bool context_has_address_families(const ExecContext *c) {
1470         assert(c);
1471
1472         return c->address_families_allow_list ||
1473                 !set_isempty(c->address_families);
1474 }
1475
1476 static bool context_has_syscall_filters(const ExecContext *c) {
1477         assert(c);
1478
1479         return c->syscall_allow_list ||
1480                 !hashmap_isempty(c->syscall_filter);
1481 }
1482
1483 static bool context_has_syscall_logs(const ExecContext *c) {
1484         assert(c);
1485
1486         return c->syscall_log_allow_list ||
1487                 !hashmap_isempty(c->syscall_log);
1488 }
1489
1490 static bool context_has_no_new_privileges(const ExecContext *c) {
1491         assert(c);
1492
1493         if (c->no_new_privileges)
1494                 return true;
1495
1496         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1497                 return false;
1498
1499         /* We need NNP if we have any form of seccomp and are unprivileged */
1500         return c->lock_personality ||
1501                 c->memory_deny_write_execute ||
1502                 c->private_devices ||
1503                 c->protect_clock ||
1504                 c->protect_hostname ||
1505                 c->protect_kernel_tunables ||
1506                 c->protect_kernel_modules ||
1507                 c->protect_kernel_logs ||
1508                 context_has_address_families(c) ||
1509                 exec_context_restrict_namespaces_set(c) ||
1510                 c->restrict_realtime ||
1511                 c->restrict_suid_sgid ||
1512                 !set_isempty(c->syscall_archs) ||
1513                 context_has_syscall_filters(c) ||
1514                 context_has_syscall_logs(c);
1515 }
1516
1517 bool exec_context_has_credentials(const ExecContext *context) {
1518
1519         assert(context);
1520
1521         return !hashmap_isempty(context->set_credentials) ||
1522                 !hashmap_isempty(context->load_credentials) ||
1523                 !set_isempty(context->import_credentials);
1524 }
1525
1526 #if HAVE_SECCOMP
1527
1528 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1529
1530         if (is_seccomp_available())
1531                 return false;
1532
1533         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1534         return true;
1535 }
1536
1537 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1538         uint32_t negative_action, default_action, action;
1539         int r;
1540
1541         assert(u);
1542         assert(c);
1543
1544         if (!context_has_syscall_filters(c))
1545                 return 0;
1546
1547         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1548                 return 0;
1549
1550         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1551
1552         if (c->syscall_allow_list) {
1553                 default_action = negative_action;
1554                 action = SCMP_ACT_ALLOW;
1555         } else {
1556                 default_action = SCMP_ACT_ALLOW;
1557                 action = negative_action;
1558         }
1559
1560         if (needs_ambient_hack) {
1561                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1562                 if (r < 0)
1563                         return r;
1564         }
1565
1566         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1567 }
1568
1569 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1570 #ifdef SCMP_ACT_LOG
1571         uint32_t default_action, action;
1572 #endif
1573
1574         assert(u);
1575         assert(c);
1576
1577         if (!context_has_syscall_logs(c))
1578                 return 0;
1579
1580 #ifdef SCMP_ACT_LOG
1581         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1582                 return 0;
1583
1584         if (c->syscall_log_allow_list) {
1585                 /* Log nothing but the ones listed */
1586                 default_action = SCMP_ACT_ALLOW;
1587                 action = SCMP_ACT_LOG;
1588         } else {
1589                 /* Log everything but the ones listed */
1590                 default_action = SCMP_ACT_LOG;
1591                 action = SCMP_ACT_ALLOW;
1592         }
1593
1594         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1595 #else
1596         /* old libseccomp */
1597         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1598         return 0;
1599 #endif
1600 }
1601
1602 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1603         assert(u);
1604         assert(c);
1605
1606         if (set_isempty(c->syscall_archs))
1607                 return 0;
1608
1609         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1610                 return 0;
1611
1612         return seccomp_restrict_archs(c->syscall_archs);
1613 }
1614
1615 static int apply_address_families(const Unit* u, const ExecContext *c) {
1616         assert(u);
1617         assert(c);
1618
1619         if (!context_has_address_families(c))
1620                 return 0;
1621
1622         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1623                 return 0;
1624
1625         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1626 }
1627
1628 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1629         int r;
1630
1631         assert(u);
1632         assert(c);
1633
1634         if (!c->memory_deny_write_execute)
1635                 return 0;
1636
1637         /* use prctl() if kernel supports it (6.3) */
1638         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1639         if (r == 0) {
1640                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1641                 return 0;
1642         }
1643         if (r < 0 && errno != EINVAL)
1644                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1645         /* else use seccomp */
1646         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1647
1648         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1649                 return 0;
1650
1651         return seccomp_memory_deny_write_execute();
1652 }
1653
1654 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1655         assert(u);
1656         assert(c);
1657
1658         if (!c->restrict_realtime)
1659                 return 0;
1660
1661         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1662                 return 0;
1663
1664         return seccomp_restrict_realtime();
1665 }
1666
1667 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1668         assert(u);
1669         assert(c);
1670
1671         if (!c->restrict_suid_sgid)
1672                 return 0;
1673
1674         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1675                 return 0;
1676
1677         return seccomp_restrict_suid_sgid();
1678 }
1679
1680 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1681         assert(u);
1682         assert(c);
1683
1684         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1685          * let's protect even those systems where this is left on in the kernel. */
1686
1687         if (!c->protect_kernel_tunables)
1688                 return 0;
1689
1690         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1691                 return 0;
1692
1693         return seccomp_protect_sysctl();
1694 }
1695
1696 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1697         assert(u);
1698         assert(c);
1699
1700         /* Turn off module syscalls on ProtectKernelModules=yes */
1701
1702         if (!c->protect_kernel_modules)
1703                 return 0;
1704
1705         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1706                 return 0;
1707
1708         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1709 }
1710
1711 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1712         assert(u);
1713         assert(c);
1714
1715         if (!c->protect_kernel_logs)
1716                 return 0;
1717
1718         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1719                 return 0;
1720
1721         return seccomp_protect_syslog();
1722 }
1723
1724 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1725         assert(u);
1726         assert(c);
1727
1728         if (!c->protect_clock)
1729                 return 0;
1730
1731         if (skip_seccomp_unavailable(u, "ProtectClock="))
1732                 return 0;
1733
1734         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1735 }
1736
1737 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1738         assert(u);
1739         assert(c);
1740
1741         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1742
1743         if (!c->private_devices)
1744                 return 0;
1745
1746         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1747                 return 0;
1748
1749         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1750 }
1751
1752 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1753         assert(u);
1754         assert(c);
1755
1756         if (!exec_context_restrict_namespaces_set(c))
1757                 return 0;
1758
1759         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1760                 return 0;
1761
1762         return seccomp_restrict_namespaces(c->restrict_namespaces);
1763 }
1764
1765 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1766         unsigned long personality;
1767         int r;
1768
1769         assert(u);
1770         assert(c);
1771
1772         if (!c->lock_personality)
1773                 return 0;
1774
1775         if (skip_seccomp_unavailable(u, "LockPersonality="))
1776                 return 0;
1777
1778         personality = c->personality;
1779
1780         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1781         if (personality == PERSONALITY_INVALID) {
1782
1783                 r = opinionated_personality(&personality);
1784                 if (r < 0)
1785                         return r;
1786         }
1787
1788         return seccomp_lock_personality(personality);
1789 }
1790
1791 #endif
1792
1793 #if HAVE_LIBBPF
1794 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1795         assert(u);
1796         assert(c);
1797
1798         if (!exec_context_restrict_filesystems_set(c))
1799                 return 0;
1800
1801         if (!u->manager->restrict_fs) {
1802                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1803                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1804                 return 0;
1805         }
1806
1807         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1808 }
1809 #endif
1810
1811 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1812         assert(u);
1813         assert(c);
1814
1815         if (!c->protect_hostname)
1816                 return 0;
1817
1818         if (ns_type_supported(NAMESPACE_UTS)) {
1819                 if (unshare(CLONE_NEWUTS) < 0) {
1820                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1821                                 *ret_exit_status = EXIT_NAMESPACE;
1822                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1823                         }
1824
1825                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1826                 }
1827         } else
1828                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1829
1830 #if HAVE_SECCOMP
1831         int r;
1832
1833         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1834                 return 0;
1835
1836         r = seccomp_protect_hostname();
1837         if (r < 0) {
1838                 *ret_exit_status = EXIT_SECCOMP;
1839                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1840         }
1841 #endif
1842
1843         return 0;
1844 }
1845
1846 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1847         assert(idle_pipe);
1848
1849         idle_pipe[1] = safe_close(idle_pipe[1]);
1850         idle_pipe[2] = safe_close(idle_pipe[2]);
1851
1852         if (idle_pipe[0] >= 0) {
1853                 int r;
1854
1855                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1856
1857                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1858                         ssize_t n;
1859
1860                         /* Signal systemd that we are bored and want to continue. */
1861                         n = write(idle_pipe[3], "x", 1);
1862                         if (n > 0)
1863                                 /* Wait for systemd to react to the signal above. */
1864                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1865                 }
1866
1867                 idle_pipe[0] = safe_close(idle_pipe[0]);
1868
1869         }
1870
1871         idle_pipe[3] = safe_close(idle_pipe[3]);
1872 }
1873
1874 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1875
1876 static int build_environment(
1877                 const Unit *u,
1878                 const ExecContext *c,
1879                 const ExecParameters *p,
1880                 const CGroupContext *cgroup_context,
1881                 size_t n_fds,
1882                 char **fdnames,
1883                 const char *home,
1884                 const char *username,
1885                 const char *shell,
1886                 dev_t journal_stream_dev,
1887                 ino_t journal_stream_ino,
1888                 const char *memory_pressure_path,
1889                 char ***ret) {
1890
1891         _cleanup_strv_free_ char **our_env = NULL;
1892         size_t n_env = 0;
1893         char *x;
1894         int r;
1895
1896         assert(u);
1897         assert(c);
1898         assert(p);
1899         assert(ret);
1900
1901 #define N_ENV_VARS 19
1902         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1903         if (!our_env)
1904                 return -ENOMEM;
1905
1906         if (n_fds > 0) {
1907                 _cleanup_free_ char *joined = NULL;
1908
1909                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1910                         return -ENOMEM;
1911                 our_env[n_env++] = x;
1912
1913                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1914                         return -ENOMEM;
1915                 our_env[n_env++] = x;
1916
1917                 joined = strv_join(fdnames, ":");
1918                 if (!joined)
1919                         return -ENOMEM;
1920
1921                 x = strjoin("LISTEN_FDNAMES=", joined);
1922                 if (!x)
1923                         return -ENOMEM;
1924                 our_env[n_env++] = x;
1925         }
1926
1927         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1928                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1929                         return -ENOMEM;
1930                 our_env[n_env++] = x;
1931
1932                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1933                         return -ENOMEM;
1934                 our_env[n_env++] = x;
1935         }
1936
1937         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1938          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1939          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1940         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1941                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1942                 if (!x)
1943                         return -ENOMEM;
1944                 our_env[n_env++] = x;
1945         }
1946
1947         if (home) {
1948                 x = strjoin("HOME=", home);
1949                 if (!x)
1950                         return -ENOMEM;
1951
1952                 path_simplify(x + 5);
1953                 our_env[n_env++] = x;
1954         }
1955
1956         if (username) {
1957                 x = strjoin("LOGNAME=", username);
1958                 if (!x)
1959                         return -ENOMEM;
1960                 our_env[n_env++] = x;
1961
1962                 x = strjoin("USER=", username);
1963                 if (!x)
1964                         return -ENOMEM;
1965                 our_env[n_env++] = x;
1966         }
1967
1968         if (shell) {
1969                 x = strjoin("SHELL=", shell);
1970                 if (!x)
1971                         return -ENOMEM;
1972
1973                 path_simplify(x + 6);
1974                 our_env[n_env++] = x;
1975         }
1976
1977         if (!sd_id128_is_null(u->invocation_id)) {
1978                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1979                         return -ENOMEM;
1980
1981                 our_env[n_env++] = x;
1982         }
1983
1984         if (exec_context_needs_term(c)) {
1985                 _cleanup_free_ char *cmdline = NULL;
1986                 const char *tty_path, *term = NULL;
1987
1988                 tty_path = exec_context_tty_path(c);
1989
1990                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1991                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1992                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1993
1994                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1995                         term = getenv("TERM");
1996                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1997                         _cleanup_free_ char *key = NULL;
1998
1999                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2000                         if (!key)
2001                                 return -ENOMEM;
2002
2003                         r = proc_cmdline_get_key(key, 0, &cmdline);
2004                         if (r < 0)
2005                                 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2006                         else if (r > 0)
2007                                 term = cmdline;
2008                 }
2009
2010                 if (!term)
2011                         term = default_term_for_tty(tty_path);
2012
2013                 x = strjoin("TERM=", term);
2014                 if (!x)
2015                         return -ENOMEM;
2016                 our_env[n_env++] = x;
2017         }
2018
2019         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2020                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2021                         return -ENOMEM;
2022
2023                 our_env[n_env++] = x;
2024         }
2025
2026         if (c->log_namespace) {
2027                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2028                 if (!x)
2029                         return -ENOMEM;
2030
2031                 our_env[n_env++] = x;
2032         }
2033
2034         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2035                 _cleanup_free_ char *joined = NULL;
2036                 const char *n;
2037
2038                 if (!p->prefix[t])
2039                         continue;
2040
2041                 if (c->directories[t].n_items == 0)
2042                         continue;
2043
2044                 n = exec_directory_env_name_to_string(t);
2045                 if (!n)
2046                         continue;
2047
2048                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2049                         _cleanup_free_ char *prefixed = NULL;
2050
2051                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2052                         if (!prefixed)
2053                                 return -ENOMEM;
2054
2055                         if (!strextend_with_separator(&joined, ":", prefixed))
2056                                 return -ENOMEM;
2057                 }
2058
2059                 x = strjoin(n, "=", joined);
2060                 if (!x)
2061                         return -ENOMEM;
2062
2063                 our_env[n_env++] = x;
2064         }
2065
2066         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2067                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2068                 if (!x)
2069                         return -ENOMEM;
2070
2071                 our_env[n_env++] = x;
2072         }
2073
2074         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2075                 return -ENOMEM;
2076
2077         our_env[n_env++] = x;
2078
2079         if (memory_pressure_path) {
2080                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2081                 if (!x)
2082                         return -ENOMEM;
2083
2084                 our_env[n_env++] = x;
2085
2086                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2087                         _cleanup_free_ char *b = NULL, *e = NULL;
2088
2089                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2090                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2091                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2092                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2093                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2094                                 return -ENOMEM;
2095
2096                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2097                                 return -ENOMEM;
2098
2099                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2100                         if (!x)
2101                                 return -ENOMEM;
2102
2103                         our_env[n_env++] = x;
2104                 }
2105         }
2106
2107         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2108 #undef N_ENV_VARS
2109
2110         *ret = TAKE_PTR(our_env);
2111
2112         return 0;
2113 }
2114
2115 static int build_pass_environment(const ExecContext *c, char ***ret) {
2116         _cleanup_strv_free_ char **pass_env = NULL;
2117         size_t n_env = 0;
2118
2119         STRV_FOREACH(i, c->pass_environment) {
2120                 _cleanup_free_ char *x = NULL;
2121                 char *v;
2122
2123                 v = getenv(*i);
2124                 if (!v)
2125                         continue;
2126                 x = strjoin(*i, "=", v);
2127                 if (!x)
2128                         return -ENOMEM;
2129
2130                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2131                         return -ENOMEM;
2132
2133                 pass_env[n_env++] = TAKE_PTR(x);
2134                 pass_env[n_env] = NULL;
2135         }
2136
2137         *ret = TAKE_PTR(pass_env);
2138
2139         return 0;
2140 }
2141
2142 bool exec_needs_network_namespace(const ExecContext *context) {
2143         assert(context);
2144
2145         return context->private_network || context->network_namespace_path;
2146 }
2147
2148 static bool exec_needs_ephemeral(const ExecContext *context) {
2149         return (context->root_image || context->root_directory) && context->root_ephemeral;
2150 }
2151
2152 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2153         assert(context);
2154
2155         return context->private_ipc || context->ipc_namespace_path;
2156 }
2157
2158 bool exec_needs_mount_namespace(
2159                 const ExecContext *context,
2160                 const ExecParameters *params,
2161                 const ExecRuntime *runtime) {
2162
2163         assert(context);
2164
2165         if (context->root_image)
2166                 return true;
2167
2168         if (!strv_isempty(context->read_write_paths) ||
2169             !strv_isempty(context->read_only_paths) ||
2170             !strv_isempty(context->inaccessible_paths) ||
2171             !strv_isempty(context->exec_paths) ||
2172             !strv_isempty(context->no_exec_paths))
2173                 return true;
2174
2175         if (context->n_bind_mounts > 0)
2176                 return true;
2177
2178         if (context->n_temporary_filesystems > 0)
2179                 return true;
2180
2181         if (context->n_mount_images > 0)
2182                 return true;
2183
2184         if (context->n_extension_images > 0)
2185                 return true;
2186
2187         if (!strv_isempty(context->extension_directories))
2188                 return true;
2189
2190         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2191                 return true;
2192
2193         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2194                 return true;
2195
2196         if (context->private_devices ||
2197             context->private_mounts > 0 ||
2198             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2199             context->protect_system != PROTECT_SYSTEM_NO ||
2200             context->protect_home != PROTECT_HOME_NO ||
2201             context->protect_kernel_tunables ||
2202             context->protect_kernel_modules ||
2203             context->protect_kernel_logs ||
2204             context->protect_control_groups ||
2205             context->protect_proc != PROTECT_PROC_DEFAULT ||
2206             context->proc_subset != PROC_SUBSET_ALL ||
2207             exec_needs_ipc_namespace(context))
2208                 return true;
2209
2210         if (context->root_directory) {
2211                 if (exec_context_get_effective_mount_apivfs(context))
2212                         return true;
2213
2214                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2215                         if (params && !params->prefix[t])
2216                                 continue;
2217
2218                         if (context->directories[t].n_items > 0)
2219                                 return true;
2220                 }
2221         }
2222
2223         if (context->dynamic_user &&
2224             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2225              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2226              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2227                 return true;
2228
2229         if (context->log_namespace)
2230                 return true;
2231
2232         return false;
2233 }
2234
2235 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2236         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2237         _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2238         _cleanup_close_ int unshare_ready_fd = -EBADF;
2239         _cleanup_(sigkill_waitp) pid_t pid = 0;
2240         uint64_t c = 1;
2241         ssize_t n;
2242         int r;
2243
2244         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2245          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2246          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2247          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2248          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2249          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2250          * continues execution normally.
2251          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2252          * does not need CAP_SETUID to write the single line mapping to itself. */
2253
2254         /* Can only set up multiple mappings with CAP_SETUID. */
2255         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2256                 r = asprintf(&uid_map,
2257                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2258                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2259                              ouid, ouid, uid, uid);
2260         else
2261                 r = asprintf(&uid_map,
2262                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2263                              ouid, ouid);
2264
2265         if (r < 0)
2266                 return -ENOMEM;
2267
2268         /* Can only set up multiple mappings with CAP_SETGID. */
2269         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2270                 r = asprintf(&gid_map,
2271                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2272                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2273                              ogid, ogid, gid, gid);
2274         else
2275                 r = asprintf(&gid_map,
2276                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2277                              ogid, ogid);
2278
2279         if (r < 0)
2280                 return -ENOMEM;
2281
2282         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2283          * namespace. */
2284         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2285         if (unshare_ready_fd < 0)
2286                 return -errno;
2287
2288         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2289          * failed. */
2290         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2291                 return -errno;
2292
2293         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2294         if (r < 0)
2295                 return r;
2296         if (r == 0) {
2297                 _cleanup_close_ int fd = -EBADF;
2298                 const char *a;
2299                 pid_t ppid;
2300
2301                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2302                  * here, after the parent opened its own user namespace. */
2303
2304                 ppid = getppid();
2305                 errno_pipe[0] = safe_close(errno_pipe[0]);
2306
2307                 /* Wait until the parent unshared the user namespace */
2308                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2309                         r = -errno;
2310                         goto child_fail;
2311                 }
2312
2313                 /* Disable the setgroups() system call in the child user namespace, for good. */
2314                 a = procfs_file_alloca(ppid, "setgroups");
2315                 fd = open(a, O_WRONLY|O_CLOEXEC);
2316                 if (fd < 0) {
2317                         if (errno != ENOENT) {
2318                                 r = -errno;
2319                                 goto child_fail;
2320                         }
2321
2322                         /* If the file is missing the kernel is too old, let's continue anyway. */
2323                 } else {
2324                         if (write(fd, "deny\n", 5) < 0) {
2325                                 r = -errno;
2326                                 goto child_fail;
2327                         }
2328
2329                         fd = safe_close(fd);
2330                 }
2331
2332                 /* First write the GID map */
2333                 a = procfs_file_alloca(ppid, "gid_map");
2334                 fd = open(a, O_WRONLY|O_CLOEXEC);
2335                 if (fd < 0) {
2336                         r = -errno;
2337                         goto child_fail;
2338                 }
2339                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2340                         r = -errno;
2341                         goto child_fail;
2342                 }
2343                 fd = safe_close(fd);
2344
2345                 /* The write the UID map */
2346                 a = procfs_file_alloca(ppid, "uid_map");
2347                 fd = open(a, O_WRONLY|O_CLOEXEC);
2348                 if (fd < 0) {
2349                         r = -errno;
2350                         goto child_fail;
2351                 }
2352                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2353                         r = -errno;
2354                         goto child_fail;
2355                 }
2356
2357                 _exit(EXIT_SUCCESS);
2358
2359         child_fail:
2360                 (void) write(errno_pipe[1], &r, sizeof(r));
2361                 _exit(EXIT_FAILURE);
2362         }
2363
2364         errno_pipe[1] = safe_close(errno_pipe[1]);
2365
2366         if (unshare(CLONE_NEWUSER) < 0)
2367                 return -errno;
2368
2369         /* Let the child know that the namespace is ready now */
2370         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2371                 return -errno;
2372
2373         /* Try to read an error code from the child */
2374         n = read(errno_pipe[0], &r, sizeof(r));
2375         if (n < 0)
2376                 return -errno;
2377         if (n == sizeof(r)) { /* an error code was sent to us */
2378                 if (r < 0)
2379                         return r;
2380                 return -EIO;
2381         }
2382         if (n != 0) /* on success we should have read 0 bytes */
2383                 return -EIO;
2384
2385         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2386         if (r < 0)
2387                 return r;
2388         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2389                 return -EIO;
2390
2391         return 0;
2392 }
2393
2394 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2395         assert(context);
2396
2397         if (!context->dynamic_user)
2398                 return false;
2399
2400         if (type == EXEC_DIRECTORY_CONFIGURATION)
2401                 return false;
2402
2403         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2404                 return false;
2405
2406         return true;
2407 }
2408
2409 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2410         _cleanup_free_ char *src_abs = NULL;
2411         int r;
2412
2413         assert(source);
2414
2415         src_abs = path_join(root, source);
2416         if (!src_abs)
2417                 return -ENOMEM;
2418
2419         STRV_FOREACH(dst, symlinks) {
2420                 _cleanup_free_ char *dst_abs = NULL;
2421
2422                 dst_abs = path_join(root, *dst);
2423                 if (!dst_abs)
2424                         return -ENOMEM;
2425
2426                 r = mkdir_parents_label(dst_abs, 0755);
2427                 if (r < 0)
2428                         return r;
2429
2430                 r = symlink_idempotent(src_abs, dst_abs, true);
2431                 if (r < 0)
2432                         return r;
2433         }
2434
2435         return 0;
2436 }
2437
2438 static int setup_exec_directory(
2439                 Unit *u,
2440                 const ExecContext *context,
2441                 const ExecParameters *params,
2442                 uid_t uid,
2443                 gid_t gid,
2444                 ExecDirectoryType type,
2445                 bool needs_mount_namespace,
2446                 int *exit_status) {
2447
2448         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2449                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2450                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2451                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2452                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2453                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2454         };
2455         int r;
2456
2457         assert(context);
2458         assert(params);
2459         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2460         assert(exit_status);
2461
2462         if (!params->prefix[type])
2463                 return 0;
2464
2465         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2466                 if (!uid_is_valid(uid))
2467                         uid = 0;
2468                 if (!gid_is_valid(gid))
2469                         gid = 0;
2470         }
2471
2472         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2473                 _cleanup_free_ char *p = NULL, *pp = NULL;
2474
2475                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2476                 if (!p) {
2477                         r = -ENOMEM;
2478                         goto fail;
2479                 }
2480
2481                 r = mkdir_parents_label(p, 0755);
2482                 if (r < 0)
2483                         goto fail;
2484
2485                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2486
2487                         /* If we are in user mode, and a configuration directory exists but a state directory
2488                          * doesn't exist, then we likely are upgrading from an older systemd version that
2489                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2490                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2491                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2492                          * separated. If a service has both dirs configured but only the configuration dir
2493                          * exists and the state dir does not, we assume we are looking at an update
2494                          * situation. Hence, create a compatibility symlink, so that all expectations are
2495                          * met.
2496                          *
2497                          * (We also do something similar with the log directory, which still doesn't exist in
2498                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2499
2500                         /* this assumes the state dir is always created before the configuration dir */
2501                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2502                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2503
2504                         r = laccess(p, F_OK);
2505                         if (r == -ENOENT) {
2506                                 _cleanup_free_ char *q = NULL;
2507
2508                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2509                                  * under the configuration hierarchy. */
2510
2511                                 if (type == EXEC_DIRECTORY_STATE)
2512                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2513                                 else if (type == EXEC_DIRECTORY_LOGS)
2514                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2515                                 else
2516                                         assert_not_reached();
2517                                 if (!q) {
2518                                         r = -ENOMEM;
2519                                         goto fail;
2520                                 }
2521
2522                                 r = laccess(q, F_OK);
2523                                 if (r >= 0) {
2524                                         /* It does exist! This hence looks like an update. Symlink the
2525                                          * configuration directory into the state directory. */
2526
2527                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2528                                         if (r < 0)
2529                                                 goto fail;
2530
2531                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2532                                         continue;
2533                                 } else if (r != -ENOENT)
2534                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2535
2536                         } else if (r < 0)
2537                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2538                 }
2539
2540                 if (exec_directory_is_private(context, type)) {
2541                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2542                          * case we want to avoid leaving a directory around fully accessible that is owned by
2543                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2544                          * trick used by container managers to prohibit host users to get access to files of
2545                          * the same UID in containers: we place everything inside a directory that has an
2546                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2547                          * for unprivileged host code. We then use fs namespacing to make this directory
2548                          * permeable for the service itself.
2549                          *
2550                          * Specifically: for a service which wants a special directory "foo/" we first create
2551                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2552                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2553                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2554                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2555                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2556                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2557                          * for the service and making sure it only gets access to the dirs it needs but no
2558                          * others. Tricky? Yes, absolutely, but it works!
2559                          *
2560                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2561                          * to be owned by the service itself.
2562                          *
2563                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2564                          * for sharing files or sockets with other services. */
2565
2566                         pp = path_join(params->prefix[type], "private");
2567                         if (!pp) {
2568                                 r = -ENOMEM;
2569                                 goto fail;
2570                         }
2571
2572                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2573                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2574                         if (r < 0)
2575                                 goto fail;
2576
2577                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2578                                 r = -ENOMEM;
2579                                 goto fail;
2580                         }
2581
2582                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2583                         r = mkdir_parents_label(pp, 0755);
2584                         if (r < 0)
2585                                 goto fail;
2586
2587                         if (is_dir(p, false) > 0 &&
2588                             (laccess(pp, F_OK) == -ENOENT)) {
2589
2590                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2591                                  * it over. Most likely the service has been upgraded from one that didn't use
2592                                  * DynamicUser=1, to one that does. */
2593
2594                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2595                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2596                                               exec_directory_type_to_string(type), p, pp);
2597
2598                                 r = RET_NERRNO(rename(p, pp));
2599                                 if (r < 0)
2600                                         goto fail;
2601                         } else {
2602                                 /* Otherwise, create the actual directory for the service */
2603
2604                                 r = mkdir_label(pp, context->directories[type].mode);
2605                                 if (r < 0 && r != -EEXIST)
2606                                         goto fail;
2607                         }
2608
2609                         if (!context->directories[type].items[i].only_create) {
2610                                 /* And link it up from the original place.
2611                                  * Notes
2612                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2613                                  *    the host, and a new one for the child namespace will be created later.
2614                                  * 2) It is not necessary to create this symlink when one of its parent
2615                                  *    directories is specified and already created. E.g.
2616                                  *        StateDirectory=foo foo/bar
2617                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2618                                  *        pp = "/var/lib/private/foo/bar"
2619                                  *        p = "/var/lib/foo/bar"
2620                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2621                                  *    we do not need to create the symlink, but we cannot create the symlink.
2622                                  *    See issue #24783. */
2623                                 r = symlink_idempotent(pp, p, true);
2624                                 if (r < 0)
2625                                         goto fail;
2626                         }
2627
2628                 } else {
2629                         _cleanup_free_ char *target = NULL;
2630
2631                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2632                             readlink_and_make_absolute(p, &target) >= 0) {
2633                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2634
2635                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2636                                  * by DynamicUser=1 (see above)?
2637                                  *
2638                                  * We do this for all directory types except for ConfigurationDirectory=,
2639                                  * since they all support the private/ symlink logic at least in some
2640                                  * configurations, see above. */
2641
2642                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2643                                 if (r < 0)
2644                                         goto fail;
2645
2646                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2647                                 if (!q) {
2648                                         r = -ENOMEM;
2649                                         goto fail;
2650                                 }
2651
2652                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2653                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2654                                 if (r < 0)
2655                                         goto fail;
2656
2657                                 if (path_equal(q_resolved, target_resolved)) {
2658
2659                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2660                                          * but is no longer. Let's move the directory back up. */
2661
2662                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2663                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2664                                                       exec_directory_type_to_string(type), q, p);
2665
2666                                         r = RET_NERRNO(unlink(p));
2667                                         if (r < 0)
2668                                                 goto fail;
2669
2670                                         r = RET_NERRNO(rename(q, p));
2671                                         if (r < 0)
2672                                                 goto fail;
2673                                 }
2674                         }
2675
2676                         r = mkdir_label(p, context->directories[type].mode);
2677                         if (r < 0) {
2678                                 if (r != -EEXIST)
2679                                         goto fail;
2680
2681                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2682                                         struct stat st;
2683
2684                                         /* Don't change the owner/access mode of the configuration directory,
2685                                          * as in the common case it is not written to by a service, and shall
2686                                          * not be writable. */
2687
2688                                         r = RET_NERRNO(stat(p, &st));
2689                                         if (r < 0)
2690                                                 goto fail;
2691
2692                                         /* Still complain if the access mode doesn't match */
2693                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2694                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2695                                                                  "(File system: %o %sMode: %o)",
2696                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2697                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2698
2699                                         continue;
2700                                 }
2701                         }
2702                 }
2703
2704                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2705                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2706                  * current UID/GID ownership.) */
2707                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2708                 if (r < 0)
2709                         goto fail;
2710
2711                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2712                  * available to user code anyway */
2713                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2714                         continue;
2715
2716                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2717                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2718                  * assignments to exist. */
2719                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2720                 if (r < 0)
2721                         goto fail;
2722         }
2723
2724         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2725          * they are set up later, to allow configuring empty var/run/etc. */
2726         if (!needs_mount_namespace)
2727                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2728                         r = create_many_symlinks(params->prefix[type],
2729                                                  context->directories[type].items[i].path,
2730                                                  context->directories[type].items[i].symlinks);
2731                         if (r < 0)
2732                                 goto fail;
2733                 }
2734
2735         return 0;
2736
2737 fail:
2738         *exit_status = exit_status_table[type];
2739         return r;
2740 }
2741
2742 static int write_credential(
2743                 int dfd,
2744                 const char *id,
2745                 const void *data,
2746                 size_t size,
2747                 uid_t uid,
2748                 bool ownership_ok) {
2749
2750         _cleanup_(unlink_and_freep) char *tmp = NULL;
2751         _cleanup_close_ int fd = -EBADF;
2752         int r;
2753
2754         r = tempfn_random_child("", "cred", &tmp);
2755         if (r < 0)
2756                 return r;
2757
2758         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2759         if (fd < 0) {
2760                 tmp = mfree(tmp);
2761                 return -errno;
2762         }
2763
2764         r = loop_write(fd, data, size, /* do_poll = */ false);
2765         if (r < 0)
2766                 return r;
2767
2768         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2769                 return -errno;
2770
2771         if (uid_is_valid(uid) && uid != getuid()) {
2772                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2773                 if (r < 0) {
2774                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2775                                 return r;
2776
2777                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2778                                             * to express: that the user gets read access and nothing
2779                                             * else. But if the backing fs can't support that (e.g. ramfs)
2780                                             * then we can use file ownership instead. But that's only safe if
2781                                             * we can then re-mount the whole thing read-only, so that the
2782                                             * user can no longer chmod() the file to gain write access. */
2783                                 return r;
2784
2785                         if (fchown(fd, uid, GID_INVALID) < 0)
2786                                 return -errno;
2787                 }
2788         }
2789
2790         if (renameat(dfd, tmp, dfd, id) < 0)
2791                 return -errno;
2792
2793         tmp = mfree(tmp);
2794         return 0;
2795 }
2796
/* Selects which set of credential store directories credential_search_path() assembles. */
typedef enum CredentialSearchPath {
        /* Only the plaintext stores ("credstore" directories) */
        CREDENTIAL_SEARCH_PATH_TRUSTED = 0,

        /* Only the encrypted stores ("credstore.encrypted" directories) */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED = 1,

        /* Encrypted stores first, followed by the plaintext ones */
        CREDENTIAL_SEARCH_PATH_ALL = 2,

        /* Number of valid selectors, used for range checks */
        _CREDENTIAL_SEARCH_PATH_MAX,

        /* Marker for "no valid selector" */
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2804
2805 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2806
2807         _cleanup_strv_free_ char **l = NULL;
2808
2809         assert(params);
2810         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2811
2812         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2813          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2814          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2815
2816         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2817                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2818                         return NULL;
2819
2820                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2821                         return NULL;
2822         }
2823
2824         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2825                 if (params->received_credentials_directory)
2826                         if (strv_extend(&l, params->received_credentials_directory) < 0)
2827                                 return NULL;
2828
2829                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2830                         return NULL;
2831         }
2832
2833         if (DEBUG_LOGGING) {
2834                 _cleanup_free_ char *t = strv_join(l, ":");
2835
2836                 log_debug("Credential search path is: %s", strempty(t));
2837         }
2838
2839         return TAKE_PTR(l);
2840 }
2841
2842 static int maybe_decrypt_and_write_credential(
2843                 int dir_fd,
2844                 const char *id,
2845                 bool encrypted,
2846                 uid_t uid,
2847                 bool ownership_ok,
2848                 const char *data,
2849                 size_t size,
2850                 uint64_t *left) {
2851
2852         _cleanup_free_ void *plaintext = NULL;
2853         size_t add;
2854         int r;
2855
2856         if (encrypted) {
2857                 size_t plaintext_size = 0;
2858
2859                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2860                                                 &plaintext, &plaintext_size);
2861                 if (r < 0)
2862                         return r;
2863
2864                 data = plaintext;
2865                 size = plaintext_size;
2866         }
2867
2868         add = strlen(id) + size;
2869         if (add > *left)
2870                 return -E2BIG;
2871
2872         r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2873         if (r < 0)
2874                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2875
2876         *left -= add;
2877         return 0;
2878 }
2879
2880 static int load_credential_glob(
2881                 const char *path,
2882                 bool encrypted,
2883                 char **search_path,
2884                 ReadFullFileFlags flags,
2885                 int write_dfd,
2886                 uid_t uid,
2887                 bool ownership_ok,
2888                 uint64_t *left) {
2889
2890         int r;
2891
2892         STRV_FOREACH(d, search_path) {
2893                 _cleanup_globfree_ glob_t pglob = {};
2894                 _cleanup_free_ char *j = NULL;
2895
2896                 j = path_join(*d, path);
2897                 if (!j)
2898                         return -ENOMEM;
2899
2900                 r = safe_glob(j, 0, &pglob);
2901                 if (r == -ENOENT)
2902                         continue;
2903                 if (r < 0)
2904                         return r;
2905
2906                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2907                         _cleanup_free_ char *fn = NULL;
2908                         _cleanup_(erase_and_freep) char *data = NULL;
2909                         size_t size;
2910
2911                         /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2912                         r = read_full_file_full(
2913                                 AT_FDCWD,
2914                                 pglob.gl_pathv[n],
2915                                 UINT64_MAX,
2916                                 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2917                                 flags,
2918                                 NULL,
2919                                 &data, &size);
2920                         if (r < 0)
2921                                 return log_debug_errno(r, "Failed to read credential '%s': %m",
2922                                                         pglob.gl_pathv[n]);
2923
2924                         r = path_extract_filename(pglob.gl_pathv[n], &fn);
2925                         if (r < 0)
2926                                 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2927                                                         pglob.gl_pathv[n]);
2928
2929                         r = maybe_decrypt_and_write_credential(
2930                                 write_dfd,
2931                                 fn,
2932                                 encrypted,
2933                                 uid,
2934                                 ownership_ok,
2935                                 data, size,
2936                                 left);
2937                         if (r == -EEXIST)
2938                                 continue;
2939                         if (r < 0)
2940                                 return r;
2941                 }
2942         }
2943
2944         return 0;
2945 }
2946
2947 static int load_credential(
2948                 const ExecContext *context,
2949                 const ExecParameters *params,
2950                 const char *id,
2951                 const char *path,
2952                 bool encrypted,
2953                 const char *unit,
2954                 int read_dfd,
2955                 int write_dfd,
2956                 uid_t uid,
2957                 bool ownership_ok,
2958                 uint64_t *left) {
2959
2960         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2961         _cleanup_strv_free_ char **search_path = NULL;
2962         _cleanup_(erase_and_freep) char *data = NULL;
2963         _cleanup_free_ char *bindname = NULL;
2964         const char *source = NULL;
2965         bool missing_ok = true;
2966         size_t size, maxsz;
2967         int r;
2968
2969         assert(context);
2970         assert(params);
2971         assert(id);
2972         assert(path);
2973         assert(unit);
2974         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2975         assert(write_dfd >= 0);
2976         assert(left);
2977
2978         if (read_dfd >= 0) {
2979                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2980                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2981                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2982                  * open it. */
2983
2984                 if (!filename_is_valid(path)) /* safety check */
2985                         return -EINVAL;
2986
2987                 missing_ok = true;
2988                 source = path;
2989
2990         } else if (path_is_absolute(path)) {
2991                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2992                  * sockets */
2993
2994                 if (!path_is_valid(path)) /* safety check */
2995                         return -EINVAL;
2996
2997                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2998
2999                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3000                  * via the source socket address in case we read off an AF_UNIX socket. */
3001                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3002                         return -ENOMEM;
3003
3004                 missing_ok = false;
3005                 source = path;
3006
3007         } else if (credential_name_valid(path)) {
3008                 /* If this is a relative path, take it as credential name relative to the credentials
3009                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3010                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3011
3012                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3013                 if (!search_path)
3014                         return -ENOMEM;
3015
3016                 missing_ok = true;
3017         } else
3018                 source = NULL;
3019
3020         if (encrypted)
3021                 flags |= READ_FULL_FILE_UNBASE64;
3022
3023         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3024
3025         if (search_path) {
3026                 STRV_FOREACH(d, search_path) {
3027                         _cleanup_free_ char *j = NULL;
3028
3029                         j = path_join(*d, path);
3030                         if (!j)
3031                                 return -ENOMEM;
3032
3033                         r = read_full_file_full(
3034                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3035                                         UINT64_MAX,
3036                                         maxsz,
3037                                         flags,
3038                                         NULL,
3039                                         &data, &size);
3040                         if (r != -ENOENT)
3041                                 break;
3042                 }
3043         } else if (source)
3044                 r = read_full_file_full(
3045                                 read_dfd, source,
3046                                 UINT64_MAX,
3047                                 maxsz,
3048                                 flags,
3049                                 bindname,
3050                                 &data, &size);
3051         else
3052                 r = -ENOENT;
3053
3054         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3055                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3056                  * will get clear errors if we don't pass such a missing credential on as they
3057                  * themselves will get ENOENT when trying to read them, which should not be much
3058                  * worse than when we handle the error here and make it fatal.
3059                  *
3060                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3061                  * we are fine, too. */
3062                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3063                 return 0;
3064         }
3065         if (r < 0)
3066                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3067
3068         return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3069 }
3070
3071 struct load_cred_args {
3072         const ExecContext *context;
3073         const ExecParameters *params;
3074         bool encrypted;
3075         const char *unit;
3076         int dfd;
3077         uid_t uid;
3078         bool ownership_ok;
3079         uint64_t *left;
3080 };
3081
3082 static int load_cred_recurse_dir_cb(
3083                 RecurseDirEvent event,
3084                 const char *path,
3085                 int dir_fd,
3086                 int inode_fd,
3087                 const struct dirent *de,
3088                 const struct statx *sx,
3089                 void *userdata) {
3090
3091         struct load_cred_args *args = ASSERT_PTR(userdata);
3092         _cleanup_free_ char *sub_id = NULL;
3093         int r;
3094
3095         if (event != RECURSE_DIR_ENTRY)
3096                 return RECURSE_DIR_CONTINUE;
3097
3098         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3099                 return RECURSE_DIR_CONTINUE;
3100
3101         sub_id = strreplace(path, "/", "_");
3102         if (!sub_id)
3103                 return -ENOMEM;
3104
3105         if (!credential_name_valid(sub_id))
3106                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3107
3108         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3109                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3110                 return RECURSE_DIR_CONTINUE;
3111         }
3112         if (errno != ENOENT)
3113                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3114
3115         r = load_credential(
3116                         args->context,
3117                         args->params,
3118                         sub_id,
3119                         de->d_name,
3120                         args->encrypted,
3121                         args->unit,
3122                         dir_fd,
3123                         args->dfd,
3124                         args->uid,
3125                         args->ownership_ok,
3126                         args->left);
3127         if (r < 0)
3128                 return r;
3129
3130         return RECURSE_DIR_CONTINUE;
3131 }
3132
3133 static int acquire_credentials(
3134                 const ExecContext *context,
3135                 const ExecParameters *params,
3136                 const char *unit,
3137                 const char *p,
3138                 uid_t uid,
3139                 bool ownership_ok) {
3140
3141         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3142         _cleanup_close_ int dfd = -EBADF;
3143         const char *ic;
3144         ExecLoadCredential *lc;
3145         ExecSetCredential *sc;
3146         int r;
3147
3148         assert(context);
3149         assert(p);
3150
3151         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3152         if (dfd < 0)
3153                 return -errno;
3154
3155         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3156         if (r < 0)
3157                 return r;
3158
3159         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3160         HASHMAP_FOREACH(lc, context->load_credentials) {
3161                 _cleanup_close_ int sub_fd = -EBADF;
3162
3163                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3164                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3165                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
3166                  * propagate a credential passed to us from further up. */
3167
3168                 if (path_is_absolute(lc->path)) {
3169                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3170                         if (sub_fd < 0 && !IN_SET(errno,
3171                                                   ENOTDIR,  /* Not a directory */
3172                                                   ENOENT))  /* Doesn't exist? */
3173                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3174                 }
3175
3176                 if (sub_fd < 0)
3177                         /* Regular file (incl. a credential passed in from higher up) */
3178                         r = load_credential(
3179                                         context,
3180                                         params,
3181                                         lc->id,
3182                                         lc->path,
3183                                         lc->encrypted,
3184                                         unit,
3185                                         AT_FDCWD,
3186                                         dfd,
3187                                         uid,
3188                                         ownership_ok,
3189                                         &left);
3190                 else
3191                         /* Directory */
3192                         r = recurse_dir(
3193                                         sub_fd,
3194                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3195                                         /* statx_mask= */ 0,
3196                                         /* n_depth_max= */ UINT_MAX,
3197                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3198                                         load_cred_recurse_dir_cb,
3199                                         &(struct load_cred_args) {
3200                                                 .context = context,
3201                                                 .params = params,
3202                                                 .encrypted = lc->encrypted,
3203                                                 .unit = unit,
3204                                                 .dfd = dfd,
3205                                                 .uid = uid,
3206                                                 .ownership_ok = ownership_ok,
3207                                                 .left = &left,
3208                                         });
3209                 if (r < 0)
3210                         return r;
3211         }
3212
3213         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3214          * override any credentials found earlier. */
3215         SET_FOREACH(ic, context->import_credentials) {
3216                 _cleanup_free_ char **search_path = NULL;
3217
3218                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3219                 if (!search_path)
3220                         return -ENOMEM;
3221
3222                 r = load_credential_glob(
3223                                 ic,
3224                                 /* encrypted = */ false,
3225                                 search_path,
3226                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3227                                 dfd,
3228                                 uid,
3229                                 ownership_ok,
3230                                 &left);
3231                 if (r < 0)
3232                         return r;
3233
3234                 search_path = strv_free(search_path);
3235                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3236                 if (!search_path)
3237                         return -ENOMEM;
3238
3239                 r = load_credential_glob(
3240                                 ic,
3241                                 /* encrypted = */ true,
3242                                 search_path,
3243                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3244                                 dfd,
3245                                 uid,
3246                                 ownership_ok,
3247                                 &left);
3248                 if (r < 0)
3249                         return r;
3250         }
3251
3252         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3253          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3254         HASHMAP_FOREACH(sc, context->set_credentials) {
3255                 _cleanup_(erase_and_freep) void *plaintext = NULL;
3256                 const char *data;
3257                 size_t size, add;
3258
3259                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3260                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3261                  * slow and involved, hence it's nice to be able to skip that if the credential already
3262                  * exists anyway. */
3263                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3264                         continue;
3265                 if (errno != ENOENT)
3266                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3267
3268                 if (sc->encrypted) {
3269                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3270                         if (r < 0)
3271                                 return r;
3272
3273                         data = plaintext;
3274                 } else {
3275                         data = sc->data;
3276                         size = sc->size;
3277                 }
3278
3279                 add = strlen(sc->id) + size;
3280                 if (add > left)
3281                         return -E2BIG;
3282
3283                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3284                 if (r < 0)
3285                         return r;
3286
3287                 left -= add;
3288         }
3289
3290         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3291         if (r < 0)
3292                 return r;
3293
3294         /* After we created all keys with the right perms, also make sure the credential store as a whole is
3295          * accessible */
3296
3297         if (uid_is_valid(uid) && uid != getuid()) {
3298                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3299                 if (r < 0) {
3300                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3301                                 return r;
3302
3303                         if (!ownership_ok)
3304                                 return r;
3305
3306                         if (fchown(dfd, uid, GID_INVALID) < 0)
3307                                 return -errno;
3308                 }
3309         }
3310
3311         return 0;
3312 }
3313
3314 static int setup_credentials_internal(
3315                 const ExecContext *context,
3316                 const ExecParameters *params,
3317                 const char *unit,
3318                 const char *final,        /* This is where the credential store shall eventually end up at */
3319                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
3320                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
3321                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3322                 uid_t uid) {
3323
3324         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3325                                    * if we mounted something; false if we definitely can't mount anything */
3326         bool final_mounted;
3327         const char *where;
3328
3329         assert(context);
3330         assert(final);
3331         assert(workspace);
3332
3333         if (reuse_workspace) {
3334                 r = path_is_mount_point(workspace, NULL, 0);
3335                 if (r < 0)
3336                         return r;
3337                 if (r > 0)
3338                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3339                 else
3340                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3341         } else
3342                 workspace_mounted = -1; /* ditto */
3343
3344         r = path_is_mount_point(final, NULL, 0);
3345         if (r < 0)
3346                 return r;
3347         if (r > 0) {
3348                 /* If the final place already has something mounted, we use that. If the workspace also has
3349                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3350                  * different). */
3351                 final_mounted = true;
3352
3353                 if (workspace_mounted < 0) {
3354                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
3355                          * the final version to the workspace, and make it writable, so that we can make
3356                          * changes */
3357
3358                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3359                         if (r < 0)
3360                                 return r;
3361
3362                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3363                         if (r < 0)
3364                                 return r;
3365
3366                         workspace_mounted = true;
3367                 }
3368         } else
3369                 final_mounted = false;
3370
3371         if (workspace_mounted < 0) {
3372                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3373
3374                 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3375                 if (r < 0) {
3376                         /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3377                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3378                         if (r < 0) {
3379                                 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3380                                         return r;
3381
3382                                 if (must_mount) /* If we it's not OK to use the plain directory
3383                                                  * fallback, propagate all errors too */
3384                                         return r;
3385
3386                                 /* If we lack privileges to bind mount stuff, then let's gracefully
3387                                  * proceed for compat with container envs, and just use the final dir
3388                                  * as is. */
3389
3390                                 workspace_mounted = false;
3391                         } else {
3392                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3393                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3394                                 if (r < 0)
3395                                         return r;
3396
3397                                 workspace_mounted = true;
3398                         }
3399                 } else
3400                         workspace_mounted = true;
3401         }
3402
3403         assert(!must_mount || workspace_mounted > 0);
3404         where = workspace_mounted ? workspace : final;
3405
3406         (void) label_fix_full(AT_FDCWD, where, final, 0);
3407
3408         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3409         if (r < 0)
3410                 return r;
3411
3412         if (workspace_mounted) {
3413                 bool install;
3414
3415                 /* Determine if we should actually install the prepared mount in the final location by bind
3416                  * mounting it there. We do so only if the mount is not established there already, and if the
3417                  * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3418                  * case we are doing all this in a mount namespace, thus no one else will see that we
3419                  * allocated a file system we are getting rid of again here. */
3420                 if (final_mounted)
3421                         install = false; /* already installed */
3422                 else {
3423                         r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3424                         if (r < 0)
3425                                 return r;
3426
3427                         install = r == 0; /* install only if non-empty */
3428                 }
3429
3430                 if (install) {
3431                         /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3432                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3433                         if (r < 0)
3434                                 return r;
3435
3436                         /* And mount it to the final place, read-only */
3437                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3438                 } else
3439                         /* Otherwise get rid of it */
3440                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3441                 if (r < 0)
3442                         return r;
3443         } else {
3444                 _cleanup_free_ char *parent = NULL;
3445
3446                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3447                  * open access to the top-level credential directory and the per-service directory now */
3448
3449                 r = path_extract_directory(final, &parent);
3450                 if (r < 0)
3451                         return r;
3452                 if (chmod(parent, 0755) < 0)
3453                         return -errno;
3454         }
3455
3456         return 0;
3457 }
3458
3459 static int setup_credentials(
3460                 const ExecContext *context,
3461                 const ExecParameters *params,
3462                 const char *unit,
3463                 uid_t uid) {
3464
3465         _cleanup_free_ char *p = NULL, *q = NULL;
3466         int r;
3467
3468         assert(context);
3469         assert(params);
3470
3471         if (!exec_context_has_credentials(context))
3472                 return 0;
3473
3474         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3475                 return -EINVAL;
3476
3477         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3478          * and the subdir we mount over with a read-only file system readable by the service's user */
3479         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3480         if (!q)
3481                 return -ENOMEM;
3482
3483         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3484         if (r < 0 && r != -EEXIST)
3485                 return r;
3486
3487         p = path_join(q, unit);
3488         if (!p)
3489                 return -ENOMEM;
3490
3491         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3492         if (r < 0 && r != -EEXIST)
3493                 return r;
3494
3495         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3496         if (r < 0) {
3497                 _cleanup_free_ char *t = NULL, *u = NULL;
3498
3499                 /* If this is not a privilege or support issue then propagate the error */
3500                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3501                         return r;
3502
3503                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3504                  * it into place, so that users can't access half-initialized credential stores. */
3505                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3506                 if (!t)
3507                         return -ENOMEM;
3508
3509                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3510                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3511                  * after it is fully set up */
3512                 u = path_join(t, unit);
3513                 if (!u)
3514                         return -ENOMEM;
3515
3516                 FOREACH_STRING(i, t, u) {
3517                         r = mkdir_label(i, 0700);
3518                         if (r < 0 && r != -EEXIST)
3519                                 return r;
3520                 }
3521
3522                 r = setup_credentials_internal(
3523                                 context,
3524                                 params,
3525                                 unit,
3526                                 p,       /* final mount point */
3527                                 u,       /* temporary workspace to overmount */
3528                                 true,    /* reuse the workspace if it is already a mount */
3529                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3530                                 uid);
3531
3532                 (void) rmdir(u); /* remove the workspace again if we can. */
3533
3534                 if (r < 0)
3535                         return r;
3536
3537         } else if (r == 0) {
3538
3539                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3540                  * we can use the same directory for all cases, after turning off propagation. Question
3541                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3542                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3543                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3544                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3545                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3546                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3547                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3548                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3549                  * propagation on the former, and then overmount the latter.
3550                  *
3551                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3552                  * for this purpose, but there are few other candidates that work equally well for us, and
3553                  * given that the we do this in a privately namespaced short-lived single-threaded process
3554                  * that no one else sees this should be OK to do. */
3555
3556                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3557                 if (r < 0)
3558                         goto child_fail;
3559
3560                 r = setup_credentials_internal(
3561                                 context,
3562                                 params,
3563                                 unit,
3564                                 p,           /* final mount point */
3565                                 "/dev/shm",  /* temporary workspace to overmount */
3566                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3567                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3568                                 uid);
3569                 if (r < 0)
3570                         goto child_fail;
3571
3572                 _exit(EXIT_SUCCESS);
3573
3574         child_fail:
3575                 _exit(EXIT_FAILURE);
3576         }
3577
3578         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3579          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3580          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3581          * seen by users when trying access this inode. */
3582         (void) rmdir(p);
3583         return 0;
3584 }
3585
3586 #if ENABLE_SMACK
3587 static int setup_smack(
3588                 const Manager *manager,
3589                 const ExecContext *context,
3590                 int executable_fd) {
3591         int r;
3592
3593         assert(context);
3594         assert(executable_fd >= 0);
3595
3596         if (context->smack_process_label) {
3597                 r = mac_smack_apply_pid(0, context->smack_process_label);
3598                 if (r < 0)
3599                         return r;
3600         } else if (manager->default_smack_process_label) {
3601                 _cleanup_free_ char *exec_label = NULL;
3602
3603                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3604                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3605                         return r;
3606
3607                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3608                 if (r < 0)
3609                         return r;
3610         }
3611
3612         return 0;
3613 }
3614 #endif
3615
3616 static int compile_bind_mounts(
3617                 const ExecContext *context,
3618                 const ExecParameters *params,
3619                 BindMount **ret_bind_mounts,
3620                 size_t *ret_n_bind_mounts,
3621                 char ***ret_empty_directories) {
3622
3623         _cleanup_strv_free_ char **empty_directories = NULL;
3624         BindMount *bind_mounts = NULL;
3625         size_t n, h = 0;
3626         int r;
3627
3628         assert(context);
3629         assert(params);
3630         assert(ret_bind_mounts);
3631         assert(ret_n_bind_mounts);
3632         assert(ret_empty_directories);
3633
3634         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3635
3636         n = context->n_bind_mounts;
3637         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3638                 if (!params->prefix[t])
3639                         continue;
3640
3641                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3642                         n += !context->directories[t].items[i].only_create;
3643         }
3644
3645         if (n <= 0) {
3646                 *ret_bind_mounts = NULL;
3647                 *ret_n_bind_mounts = 0;
3648                 *ret_empty_directories = NULL;
3649                 return 0;
3650         }
3651
3652         bind_mounts = new(BindMount, n);
3653         if (!bind_mounts)
3654                 return -ENOMEM;
3655
3656         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3657                 BindMount *item = context->bind_mounts + i;
3658                 _cleanup_free_ char *s = NULL, *d = NULL;
3659
3660                 s = strdup(item->source);
3661                 if (!s)
3662                         return -ENOMEM;
3663
3664                 d = strdup(item->destination);
3665                 if (!d)
3666                         return -ENOMEM;
3667
3668                 bind_mounts[h++] = (BindMount) {
3669                         .source = TAKE_PTR(s),
3670                         .destination = TAKE_PTR(d),
3671                         .read_only = item->read_only,
3672                         .recursive = item->recursive,
3673                         .ignore_enoent = item->ignore_enoent,
3674                 };
3675         }
3676
3677         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3678                 if (!params->prefix[t])
3679                         continue;
3680
3681                 if (context->directories[t].n_items == 0)
3682                         continue;
3683
3684                 if (exec_directory_is_private(context, t) &&
3685                     !exec_context_with_rootfs(context)) {
3686                         char *private_root;
3687
3688                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3689                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3690                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3691
3692                         private_root = path_join(params->prefix[t], "private");
3693                         if (!private_root)
3694                                 return -ENOMEM;
3695
3696                         r = strv_consume(&empty_directories, private_root);
3697                         if (r < 0)
3698                                 return r;
3699                 }
3700
3701                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3702                         _cleanup_free_ char *s = NULL, *d = NULL;
3703
3704                         /* When one of the parent directories is in the list, we cannot create the symlink
3705                          * for the child directory. See also the comments in setup_exec_directory(). */
3706                         if (context->directories[t].items[i].only_create)
3707                                 continue;
3708
3709                         if (exec_directory_is_private(context, t))
3710                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3711                         else
3712                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3713                         if (!s)
3714                                 return -ENOMEM;
3715
3716                         if (exec_directory_is_private(context, t) &&
3717                             exec_context_with_rootfs(context))
3718                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3719                                  * directory is not created on the root directory. So, let's bind-mount the directory
3720                                  * on the 'non-private' place. */
3721                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3722                         else
3723                                 d = strdup(s);
3724                         if (!d)
3725                                 return -ENOMEM;
3726
3727                         bind_mounts[h++] = (BindMount) {
3728                                 .source = TAKE_PTR(s),
3729                                 .destination = TAKE_PTR(d),
3730                                 .read_only = false,
3731                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3732                                 .recursive = true,
3733                                 .ignore_enoent = false,
3734                         };
3735                 }
3736         }
3737
3738         assert(h == n);
3739
3740         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3741         *ret_n_bind_mounts = n;
3742         *ret_empty_directories = TAKE_PTR(empty_directories);
3743
3744         return (int) n;
3745 }
3746
3747 /* ret_symlinks will contain a list of pairs src:dest that describes
3748  * the symlinks to create later on. For example, the symlinks needed
3749  * to safely give private directories to DynamicUser=1 users. */
3750 static int compile_symlinks(
3751                 const ExecContext *context,
3752                 const ExecParameters *params,
3753                 char ***ret_symlinks) {
3754
3755         _cleanup_strv_free_ char **symlinks = NULL;
3756         int r;
3757
3758         assert(context);
3759         assert(params);
3760         assert(ret_symlinks);
3761
3762         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3763                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3764                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3765
3766                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3767                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3768
3769                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3770                                 dst_abs = path_join(params->prefix[dt], *symlink);
3771                                 if (!src_abs || !dst_abs)
3772                                         return -ENOMEM;
3773
3774                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3775                                 if (r < 0)
3776                                         return r;
3777                         }
3778
3779                         if (!exec_directory_is_private(context, dt) ||
3780                             exec_context_with_rootfs(context) ||
3781                             context->directories[dt].items[i].only_create)
3782                                 continue;
3783
3784                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3785                         if (!private_path)
3786                                 return -ENOMEM;
3787
3788                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3789                         if (!path)
3790                                 return -ENOMEM;
3791
3792                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3793                         if (r < 0)
3794                                 return r;
3795                 }
3796         }
3797
3798         *ret_symlinks = TAKE_PTR(symlinks);
3799
3800         return 0;
3801 }
3802
3803 static bool insist_on_sandboxing(
3804                 const ExecContext *context,
3805                 const char *root_dir,
3806                 const char *root_image,
3807                 const BindMount *bind_mounts,
3808                 size_t n_bind_mounts) {
3809
3810         assert(context);
3811         assert(n_bind_mounts == 0 || bind_mounts);
3812
3813         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3814          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3815          * rearrange stuff in a way we cannot ignore gracefully. */
3816
3817         if (context->n_temporary_filesystems > 0)
3818                 return true;
3819
3820         if (root_dir || root_image)
3821                 return true;
3822
3823         if (context->n_mount_images > 0)
3824                 return true;
3825
3826         if (context->dynamic_user)
3827                 return true;
3828
3829         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3830                 return true;
3831
3832         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3833          * essential. */
3834         for (size_t i = 0; i < n_bind_mounts; i++)
3835                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3836                         return true;
3837
3838         if (context->log_namespace)
3839                 return true;
3840
3841         return false;
3842 }
3843
3844 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3845         _cleanup_close_ int fd = -EBADF;
3846         int r;
3847
3848         if (!runtime || !runtime->ephemeral_copy)
3849                 return 0;
3850
3851         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3852         if (r < 0)
3853                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3854
3855         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3856
3857         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3858         if (fd >= 0)
3859                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3860                 return 0;
3861
3862         if (fd != -EAGAIN)
3863                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3864
3865         log_debug("Making ephemeral snapshot of %s to %s",
3866                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3867
3868         if (context->root_image)
3869                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3870                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3871         else
3872                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3873                                               AT_FDCWD, runtime->ephemeral_copy,
3874                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3875                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3876                                               BTRFS_SNAPSHOT_RECURSIVE |
3877                                               BTRFS_SNAPSHOT_LOCK_BSD);
3878         if (fd < 0)
3879                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3880                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3881
3882         if (context->root_image) {
3883                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3884                  * which tends to not perform well in combination with lots of random writes.
3885                  *
3886                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3887                  * copy, but we at least want to make the intention clear.
3888                  */
3889                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3890                 if (r < 0)
3891                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3892         }
3893
3894         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3895         if (r < 0)
3896                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3897
3898         return 1;
3899 }
3900
3901 static int verity_settings_prepare(
3902                 VeritySettings *verity,
3903                 const char *root_image,
3904                 const void *root_hash,
3905                 size_t root_hash_size,
3906                 const char *root_hash_path,
3907                 const void *root_hash_sig,
3908                 size_t root_hash_sig_size,
3909                 const char *root_hash_sig_path,
3910                 const char *verity_data_path) {
3911
3912         int r;
3913
3914         assert(verity);
3915
3916         if (root_hash) {
3917                 void *d;
3918
3919                 d = memdup(root_hash, root_hash_size);
3920                 if (!d)
3921                         return -ENOMEM;
3922
3923                 free_and_replace(verity->root_hash, d);
3924                 verity->root_hash_size = root_hash_size;
3925                 verity->designator = PARTITION_ROOT;
3926         }
3927
3928         if (root_hash_sig) {
3929                 void *d;
3930
3931                 d = memdup(root_hash_sig, root_hash_sig_size);
3932                 if (!d)
3933                         return -ENOMEM;
3934
3935                 free_and_replace(verity->root_hash_sig, d);
3936                 verity->root_hash_sig_size = root_hash_sig_size;
3937                 verity->designator = PARTITION_ROOT;
3938         }
3939
3940         if (verity_data_path) {
3941                 r = free_and_strdup(&verity->data_path, verity_data_path);
3942                 if (r < 0)
3943                         return r;
3944         }
3945
3946         r = verity_settings_load(
3947                         verity,
3948                         root_image,
3949                         root_hash_path,
3950                         root_hash_sig_path);
3951         if (r < 0)
3952                 return log_debug_errno(r, "Failed to load root hash: %m");
3953
3954         return 0;
3955 }
3956
3957 static int apply_mount_namespace(
3958                 const Unit *u,
3959                 ExecCommandFlags command_flags,
3960                 const ExecContext *context,
3961                 const ExecParameters *params,
3962                 ExecRuntime *runtime,
3963                 const char *memory_pressure_path,
3964                 char **error_path) {
3965
3966         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3967         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3968                         **read_write_paths_cleanup = NULL;
3969         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3970                         *extension_dir = NULL, *host_os_release = NULL;
3971         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3972         char **read_write_paths;
3973         NamespaceInfo ns_info;
3974         bool needs_sandboxing;
3975         BindMount *bind_mounts = NULL;
3976         size_t n_bind_mounts = 0;
3977         int r;
3978
3979         assert(context);
3980
3981         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3982
3983         if (params->flags & EXEC_APPLY_CHROOT) {
3984                 r = setup_ephemeral(context, runtime);
3985                 if (r < 0)
3986                         return r;
3987
3988                 if (context->root_image)
3989                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3990                 else
3991                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3992         }
3993
3994         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3995         if (r < 0)
3996                 return r;
3997
3998         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3999         r = compile_symlinks(context, params, &symlinks);
4000         if (r < 0)
4001                 return r;
4002
4003         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4004          * service will need to write to it in order to start the notifications. */
4005         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4006                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4007                 if (!read_write_paths_cleanup)
4008                         return -ENOMEM;
4009
4010                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4011                 if (r < 0)
4012                         return r;
4013
4014                 read_write_paths = read_write_paths_cleanup;
4015         } else
4016                 read_write_paths = context->read_write_paths;
4017
4018         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4019         if (needs_sandboxing) {
4020                 /* The runtime struct only contains the parent of the private /tmp,
4021                  * which is non-accessible to world users. Inside of it there's a /tmp
4022                  * that is sticky, and that's the one we want to use here.
4023                  * This does not apply when we are using /run/systemd/empty as fallback. */
4024
4025                 if (context->private_tmp && runtime && runtime->shared) {
4026                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4027                                 tmp_dir = runtime->shared->tmp_dir;
4028                         else if (runtime->shared->tmp_dir)
4029                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4030
4031                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4032                                 var_tmp_dir = runtime->shared->var_tmp_dir;
4033                         else if (runtime->shared->var_tmp_dir)
4034                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4035                 }
4036
4037                 ns_info = (NamespaceInfo) {
4038                         .ignore_protect_paths = false,
4039                         .private_dev = context->private_devices,
4040                         .protect_control_groups = context->protect_control_groups,
4041                         .protect_kernel_tunables = context->protect_kernel_tunables,
4042                         .protect_kernel_modules = context->protect_kernel_modules,
4043                         .protect_kernel_logs = context->protect_kernel_logs,
4044                         .protect_hostname = context->protect_hostname,
4045                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4046                         .protect_home = context->protect_home,
4047                         .protect_system = context->protect_system,
4048                         .protect_proc = context->protect_proc,
4049                         .proc_subset = context->proc_subset,
4050                         .private_network = exec_needs_network_namespace(context),
4051                         .private_ipc = exec_needs_ipc_namespace(context),
4052                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4053                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4054                 };
4055         } else if (!context->dynamic_user && root_dir)
4056                 /*
4057                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4058                  * sandbox info, otherwise enforce it, don't ignore protected paths and
4059                  * fail if we are enable to apply the sandbox inside the mount namespace.
4060                  */
4061                 ns_info = (NamespaceInfo) {
4062                         .ignore_protect_paths = true,
4063                 };
4064         else
4065                 ns_info = (NamespaceInfo) {};
4066
4067         if (context->mount_propagation_flag == MS_SHARED)
4068                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4069
4070         if (exec_context_has_credentials(context) &&
4071             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4072             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4073                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4074                 if (!creds_path)
4075                         return -ENOMEM;
4076         }
4077
4078         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4079                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4080                 if (!propagate_dir)
4081                         return -ENOMEM;
4082
4083                 incoming_dir = strdup("/run/systemd/incoming");
4084                 if (!incoming_dir)
4085                         return -ENOMEM;
4086
4087                 extension_dir = strdup("/run/systemd/unit-extensions");
4088                 if (!extension_dir)
4089                         return -ENOMEM;
4090
4091                 /* If running under a different root filesystem, propagate the host's os-release. We make a
4092                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
4093                 if (root_dir || root_image) {
4094                         host_os_release = strdup("/run/systemd/propagate/os-release");
4095                         if (!host_os_release)
4096                                 return -ENOMEM;
4097                 }
4098         } else {
4099                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4100
4101                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4102                         return -ENOMEM;
4103
4104                 if (root_dir || root_image) {
4105                         if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
4106                                 return -ENOMEM;
4107                 }
4108         }
4109
4110         if (root_image) {
4111                 r = verity_settings_prepare(
4112                         &verity,
4113                         root_image,
4114                         context->root_hash, context->root_hash_size, context->root_hash_path,
4115                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4116                         context->root_verity);
4117                 if (r < 0)
4118                         return r;
4119         }
4120
4121         r = setup_namespace(
4122                         root_dir,
4123                         root_image,
4124                         context->root_image_options,
4125                         context->root_image_policy ?: &image_policy_service,
4126                         &ns_info,
4127                         read_write_paths,
4128                         needs_sandboxing ? context->read_only_paths : NULL,
4129                         needs_sandboxing ? context->inaccessible_paths : NULL,
4130                         needs_sandboxing ? context->exec_paths : NULL,
4131                         needs_sandboxing ? context->no_exec_paths : NULL,
4132                         empty_directories,
4133                         symlinks,
4134                         bind_mounts,
4135                         n_bind_mounts,
4136                         context->temporary_filesystems,
4137                         context->n_temporary_filesystems,
4138                         context->mount_images,
4139                         context->n_mount_images,
4140                         context->mount_image_policy ?: &image_policy_service,
4141                         tmp_dir,
4142                         var_tmp_dir,
4143                         creds_path,
4144                         context->log_namespace,
4145                         context->mount_propagation_flag,
4146                         &verity,
4147                         context->extension_images,
4148                         context->n_extension_images,
4149                         context->extension_image_policy ?: &image_policy_sysext,
4150                         context->extension_directories,
4151                         propagate_dir,
4152                         incoming_dir,
4153                         extension_dir,
4154                         root_dir || root_image ? params->notify_socket : NULL,
4155                         host_os_release,
4156                         error_path);
4157
4158         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4159          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4160          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4161          * completely different execution environment. */
4162         if (r == -ENOANO) {
4163                 if (insist_on_sandboxing(
4164                                     context,
4165                                     root_dir, root_image,
4166                                     bind_mounts,
4167                                     n_bind_mounts))
4168                         return log_unit_debug_errno(u,
4169                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
4170                                                     "Failed to set up namespace, and refusing to continue since "
4171                                                     "the selected namespacing options alter mount environment non-trivially.\n"
4172                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4173                                                     n_bind_mounts,
4174                                                     context->n_temporary_filesystems,
4175                                                     yes_no(root_dir),
4176                                                     yes_no(root_image),
4177                                                     yes_no(context->dynamic_user));
4178
4179                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4180                 return 0;
4181         }
4182
4183         return r;
4184 }
4185
4186 static int apply_working_directory(
4187                 const ExecContext *context,
4188                 const ExecParameters *params,
4189                 ExecRuntime *runtime,
4190                 const char *home,
4191                 int *exit_status) {
4192
4193         const char *d, *wd;
4194
4195         assert(context);
4196         assert(exit_status);
4197
4198         if (context->working_directory_home) {
4199
4200                 if (!home) {
4201                         *exit_status = EXIT_CHDIR;
4202                         return -ENXIO;
4203                 }
4204
4205                 wd = home;
4206
4207         } else
4208                 wd = empty_to_root(context->working_directory);
4209
4210         if (params->flags & EXEC_APPLY_CHROOT)
4211                 d = wd;
4212         else
4213                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4214
4215         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4216                 *exit_status = EXIT_CHDIR;
4217                 return -errno;
4218         }
4219
4220         return 0;
4221 }
4222
4223 static int apply_root_directory(
4224                 const ExecContext *context,
4225                 const ExecParameters *params,
4226                 ExecRuntime *runtime,
4227                 const bool needs_mount_ns,
4228                 int *exit_status) {
4229
4230         assert(context);
4231         assert(exit_status);
4232
4233         if (params->flags & EXEC_APPLY_CHROOT)
4234                 if (!needs_mount_ns && context->root_directory)
4235                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4236                                 *exit_status = EXIT_CHROOT;
4237                                 return -errno;
4238                         }
4239
4240         return 0;
4241 }
4242
4243 static int setup_keyring(
4244                 const Unit *u,
4245                 const ExecContext *context,
4246                 const ExecParameters *p,
4247                 uid_t uid, gid_t gid) {
4248
4249         key_serial_t keyring;
4250         int r = 0;
4251         uid_t saved_uid;
4252         gid_t saved_gid;
4253
4254         assert(u);
4255         assert(context);
4256         assert(p);
4257
4258         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4259          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4260          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4261          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4262          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4263          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4264
4265         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4266                 return 0;
4267
4268         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4269          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4270          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4271          * & group is just as nasty as acquiring a reference to the user keyring. */
4272
4273         saved_uid = getuid();
4274         saved_gid = getgid();
4275
4276         if (gid_is_valid(gid) && gid != saved_gid) {
4277                 if (setregid(gid, -1) < 0)
4278                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4279         }
4280
4281         if (uid_is_valid(uid) && uid != saved_uid) {
4282                 if (setreuid(uid, -1) < 0) {
4283                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4284                         goto out;
4285                 }
4286         }
4287
4288         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4289         if (keyring == -1) {
4290                 if (errno == ENOSYS)
4291                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4292                 else if (ERRNO_IS_PRIVILEGE(errno))
4293                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4294                 else if (errno == EDQUOT)
4295                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4296                 else
4297                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4298
4299                 goto out;
4300         }
4301
4302         /* When requested link the user keyring into the session keyring. */
4303         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4304
4305                 if (keyctl(KEYCTL_LINK,
4306                            KEY_SPEC_USER_KEYRING,
4307                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4308                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4309                         goto out;
4310                 }
4311         }
4312
4313         /* Restore uid/gid back */
4314         if (uid_is_valid(uid) && uid != saved_uid) {
4315                 if (setreuid(saved_uid, -1) < 0) {
4316                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4317                         goto out;
4318                 }
4319         }
4320
4321         if (gid_is_valid(gid) && gid != saved_gid) {
4322                 if (setregid(saved_gid, -1) < 0)
4323                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4324         }
4325
4326         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4327         if (!sd_id128_is_null(u->invocation_id)) {
4328                 key_serial_t key;
4329
4330                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4331                 if (key == -1)
4332                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4333                 else {
4334                         if (keyctl(KEYCTL_SETPERM, key,
4335                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4336                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4337                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4338                 }
4339         }
4340
4341 out:
4342         /* Revert back uid & gid for the last time, and exit */
4343         /* no extra logging, as only the first already reported error matters */
4344         if (getuid() != saved_uid)
4345                 (void) setreuid(saved_uid, -1);
4346
4347         if (getgid() != saved_gid)
4348                 (void) setregid(saved_gid, -1);
4349
4350         return r;
4351 }
4352
/* Appends those of the two file descriptors in 'pair' that are valid (i.e. non-negative) to 'array',
 * first pair[0], then pair[1]. *n is the current fill level of the array and is advanced by one for
 * each fd stored. The caller has to make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
4363
4364 static int close_remaining_fds(
4365                 const ExecParameters *params,
4366                 const ExecRuntime *runtime,
4367                 int user_lookup_fd,
4368                 int socket_fd,
4369                 const int *fds, size_t n_fds) {
4370
4371         size_t n_dont_close = 0;
4372         int dont_close[n_fds + 14];
4373
4374         assert(params);
4375
4376         if (params->stdin_fd >= 0)
4377                 dont_close[n_dont_close++] = params->stdin_fd;
4378         if (params->stdout_fd >= 0)
4379                 dont_close[n_dont_close++] = params->stdout_fd;
4380         if (params->stderr_fd >= 0)
4381                 dont_close[n_dont_close++] = params->stderr_fd;
4382
4383         if (socket_fd >= 0)
4384                 dont_close[n_dont_close++] = socket_fd;
4385         if (n_fds > 0) {
4386                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4387                 n_dont_close += n_fds;
4388         }
4389
4390         if (runtime)
4391                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4392
4393         if (runtime && runtime->shared) {
4394                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4395                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4396         }
4397
4398         if (runtime && runtime->dynamic_creds) {
4399                 if (runtime->dynamic_creds->user)
4400                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4401                 if (runtime->dynamic_creds->group)
4402                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4403         }
4404
4405         if (user_lookup_fd >= 0)
4406                 dont_close[n_dont_close++] = user_lookup_fd;
4407
4408         return close_all_fds(dont_close, n_dont_close);
4409 }
4410
4411 static int send_user_lookup(
4412                 Unit *unit,
4413                 int user_lookup_fd,
4414                 uid_t uid,
4415                 gid_t gid) {
4416
4417         assert(unit);
4418
4419         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4420          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4421          * specified. */
4422
4423         if (user_lookup_fd < 0)
4424                 return 0;
4425
4426         if (!uid_is_valid(uid) && !gid_is_valid(gid))
4427                 return 0;
4428
4429         if (writev(user_lookup_fd,
4430                (struct iovec[]) {
4431                            IOVEC_MAKE(&uid, sizeof(uid)),
4432                            IOVEC_MAKE(&gid, sizeof(gid)),
4433                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4434                 return -errno;
4435
4436         return 0;
4437 }
4438
4439 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4440         int r;
4441
4442         assert(c);
4443         assert(home);
4444         assert(buf);
4445
4446         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4447
4448         if (*home)
4449                 return 0;
4450
4451         if (!c->working_directory_home)
4452                 return 0;
4453
4454         r = get_home_dir(buf);
4455         if (r < 0)
4456                 return r;
4457
4458         *home = *buf;
4459         return 1;
4460 }
4461
4462 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4463         _cleanup_strv_free_ char ** list = NULL;
4464         int r;
4465
4466         assert(c);
4467         assert(p);
4468         assert(ret);
4469
4470         assert(c->dynamic_user);
4471
4472         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4473          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4474          * directories. */
4475
4476         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4477                 if (t == EXEC_DIRECTORY_CONFIGURATION)
4478                         continue;
4479
4480                 if (!p->prefix[t])
4481                         continue;
4482
4483                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4484                         char *e;
4485
4486                         if (exec_directory_is_private(c, t))
4487                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4488                         else
4489                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4490                         if (!e)
4491                                 return -ENOMEM;
4492
4493                         r = strv_consume(&list, e);
4494                         if (r < 0)
4495                                 return r;
4496                 }
4497         }
4498
4499         *ret = TAKE_PTR(list);
4500
4501         return 0;
4502 }
4503
4504 static int exec_parameters_get_cgroup_path(
4505                 const ExecParameters *params,
4506                 const CGroupContext *c,
4507                 char **ret) {
4508
4509         const char *subgroup = NULL;
4510         char *p;
4511
4512         assert(params);
4513         assert(ret);
4514
4515         if (!params->cgroup_path)
4516                 return -EINVAL;
4517
4518         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4519          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4520          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4521          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4522          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4523          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4524          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4525          * flag, which is only passed for the former statements, not for the latter. */
4526
4527         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4528                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4529                         subgroup = ".control";
4530                 else
4531                         subgroup = c->delegate_subgroup;
4532         }
4533
4534         if (subgroup)
4535                 p = path_join(params->cgroup_path, subgroup);
4536         else
4537                 p = strdup(params->cgroup_path);
4538         if (!p)
4539                 return -ENOMEM;
4540
4541         *ret = p;
4542         return !!subgroup;
4543 }
4544
4545 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4546         _cleanup_(cpu_set_reset) CPUSet s = {};
4547         int r;
4548
4549         assert(c);
4550         assert(ret);
4551
4552         if (!c->numa_policy.nodes.set) {
4553                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4554                 return 0;
4555         }
4556
4557         r = numa_to_cpu_set(&c->numa_policy, &s);
4558         if (r < 0)
4559                 return r;
4560
4561         cpu_set_reset(ret);
4562
4563         return cpu_set_add_all(ret, &s);
4564 }
4565
4566 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4567         assert(c);
4568
4569         return c->cpu_affinity_from_numa;
4570 }
4571
4572 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4573         int r;
4574
4575         assert(fds);
4576         assert(n_fds);
4577         assert(*n_fds < fds_size);
4578         assert(ret_fd);
4579
4580         if (fd < 0) {
4581                 *ret_fd = -EBADF;
4582                 return 0;
4583         }
4584
4585         if (fd < 3 + (int) *n_fds) {
4586                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4587                  * the fds we pass to the process (or which are closed only during execve). */
4588
4589                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4590                 if (r < 0)
4591                         return -errno;
4592
4593                 close_and_replace(fd, r);
4594         }
4595
4596         *ret_fd = fds[*n_fds] = fd;
4597         (*n_fds) ++;
4598         return 1;
4599 }
4600
4601 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4602         union sockaddr_union addr = {
4603                 .un.sun_family = AF_UNIX,
4604         };
4605         socklen_t sa_len;
4606         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4607         int r;
4608
4609         assert(u);
4610         assert(of);
4611         assert(ofd >= 0);
4612
4613         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4614         if (r < 0)
4615                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4616
4617         sa_len = r;
4618
4619         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4620                 _cleanup_close_ int fd = -EBADF;
4621
4622                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4623                 if (fd < 0)
4624                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4625
4626                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4627                 if (r == -EPROTOTYPE)
4628                         continue;
4629                 if (r < 0)
4630                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4631
4632                 return TAKE_FD(fd);
4633         }
4634
4635         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4636 }
4637
4638 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4639         struct stat st;
4640         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4641
4642         assert(u);
4643         assert(of);
4644
4645         ofd = open(of->path, O_PATH | O_CLOEXEC);
4646         if (ofd < 0)
4647                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4648
4649         if (fstat(ofd, &st) < 0)
4650                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4651
4652         if (S_ISSOCK(st.st_mode)) {
4653                 fd = connect_unix_harder(u, of, ofd);
4654                 if (fd < 0)
4655                         return fd;
4656
4657                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4658                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4659                                                     of->path);
4660
4661                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4662         } else {
4663                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4664                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4665                         flags |= O_APPEND;
4666                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4667                         flags |= O_TRUNC;
4668
4669                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4670                 if (fd < 0)
4671                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4672
4673                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4674         }
4675
4676         return TAKE_FD(fd);
4677 }
4678
4679 static int collect_open_file_fds(
4680                 Unit *u,
4681                 OpenFile* open_files,
4682                 int **fds,
4683                 char ***fdnames,
4684                 size_t *n_fds) {
4685         int r;
4686
4687         assert(u);
4688         assert(fds);
4689         assert(fdnames);
4690         assert(n_fds);
4691
4692         LIST_FOREACH(open_files, of, open_files) {
4693                 _cleanup_close_ int fd = -EBADF;
4694
4695                 fd = get_open_file_fd(u, of);
4696                 if (fd < 0) {
4697                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4698                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4699                                 continue;
4700                         }
4701
4702                         return fd;
4703                 }
4704
4705                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4706                         return -ENOMEM;
4707
4708                 r = strv_extend(fdnames, of->fdname);
4709                 if (r < 0)
4710                         return r;
4711
4712                 (*fds)[*n_fds] = TAKE_FD(fd);
4713
4714                 (*n_fds)++;
4715         }
4716
4717         return 0;
4718 }
4719
4720 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4721         assert(unit);
4722         assert(msg);
4723         assert(executable);
4724
4725         if (!DEBUG_LOGGING)
4726                 return;
4727
4728         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4729
4730         log_unit_struct(unit, LOG_DEBUG,
4731                         "EXECUTABLE=%s", executable,
4732                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4733                         LOG_UNIT_INVOCATION_ID(unit));
4734 }
4735
4736 static bool exec_context_need_unprivileged_private_users(
4737                 const ExecContext *context,
4738                 const ExecParameters *params) {
4739
4740         assert(context);
4741         assert(params);
4742
4743         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4744          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4745          * (system manager) then we have privileges and don't need this. */
4746         if (params->runtime_scope != RUNTIME_SCOPE_USER)
4747                 return false;
4748
4749         return context->private_users ||
4750                context->private_tmp ||
4751                context->private_devices ||
4752                context->private_network ||
4753                context->network_namespace_path ||
4754                context->private_ipc ||
4755                context->ipc_namespace_path ||
4756                context->private_mounts > 0 ||
4757                context->mount_apivfs ||
4758                context->n_bind_mounts > 0 ||
4759                context->n_temporary_filesystems > 0 ||
4760                context->root_directory ||
4761                !strv_isempty(context->extension_directories) ||
4762                context->protect_system != PROTECT_SYSTEM_NO ||
4763                context->protect_home != PROTECT_HOME_NO ||
4764                context->protect_kernel_tunables ||
4765                context->protect_kernel_modules ||
4766                context->protect_kernel_logs ||
4767                context->protect_control_groups ||
4768                context->protect_clock ||
4769                context->protect_hostname ||
4770                !strv_isempty(context->read_write_paths) ||
4771                !strv_isempty(context->read_only_paths) ||
4772                !strv_isempty(context->inaccessible_paths) ||
4773                !strv_isempty(context->exec_paths) ||
4774                !strv_isempty(context->no_exec_paths);
4775 }
4776
4777 static int exec_child(
4778                 Unit *unit,
4779                 const ExecCommand *command,
4780                 const ExecContext *context,
4781                 const ExecParameters *params,
4782                 ExecRuntime *runtime,
4783                 const CGroupContext *cgroup_context,
4784                 int socket_fd,
4785                 const int named_iofds[static 3],
4786                 int *params_fds,
4787                 size_t n_socket_fds,
4788                 size_t n_storage_fds,
4789                 char **files_env,
4790                 int user_lookup_fd,
4791                 int *exit_status) {
4792
4793         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4794         int r, ngids = 0, exec_fd;
4795         _cleanup_free_ gid_t *supplementary_gids = NULL;
4796         const char *username = NULL, *groupname = NULL;
4797         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4798         const char *home = NULL, *shell = NULL;
4799         char **final_argv = NULL;
4800         dev_t journal_stream_dev = 0;
4801         ino_t journal_stream_ino = 0;
4802         bool userns_set_up = false;
4803         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4804                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4805                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4806                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4807 #if HAVE_SELINUX
4808         _cleanup_free_ char *mac_selinux_context_net = NULL;
4809         bool use_selinux = false;
4810 #endif
4811 #if ENABLE_SMACK
4812         bool use_smack = false;
4813 #endif
4814 #if HAVE_APPARMOR
4815         bool use_apparmor = false;
4816 #endif
4817         uid_t saved_uid = getuid();
4818         gid_t saved_gid = getgid();
4819         uid_t uid = UID_INVALID;
4820         gid_t gid = GID_INVALID;
4821         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4822                n_keep_fds; /* total number of fds not to close */
4823         int secure_bits;
4824         _cleanup_free_ gid_t *gids_after_pam = NULL;
4825         int ngids_after_pam = 0;
4826         _cleanup_free_ int *fds = NULL;
4827         _cleanup_strv_free_ char **fdnames = NULL;
4828
4829         assert(unit);
4830         assert(command);
4831         assert(context);
4832         assert(params);
4833         assert(exit_status);
4834
4835         /* Explicitly test for CVE-2021-4034 inspired invocations */
4836         assert(command->path);
4837         assert(!strv_isempty(command->argv));
4838
4839         rename_process_from_path(command->path);
4840
4841         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4842          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4843          * both of which will be demoted to SIG_DFL. */
4844         (void) default_signals(SIGNALS_CRASH_HANDLER,
4845                                SIGNALS_IGNORE);
4846
4847         if (context->ignore_sigpipe)
4848                 (void) ignore_signals(SIGPIPE);
4849
4850         r = reset_signal_mask();
4851         if (r < 0) {
4852                 *exit_status = EXIT_SIGNAL_MASK;
4853                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4854         }
4855
4856         if (params->idle_pipe)
4857                 do_idle_pipe_dance(params->idle_pipe);
4858
4859         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4860          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4861          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4862          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4863
4864         log_forget_fds();
4865         log_set_open_when_needed(true);
4866         log_settle_target();
4867
4868         /* In case anything used libc syslog(), close this here, too */
4869         closelog();
4870
4871         fds = newdup(int, params_fds, n_fds);
4872         if (!fds) {
4873                 *exit_status = EXIT_MEMORY;
4874                 return log_oom();
4875         }
4876
4877         fdnames = strv_copy((char**) params->fd_names);
4878         if (!fdnames) {
4879                 *exit_status = EXIT_MEMORY;
4880                 return log_oom();
4881         }
4882
4883         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4884         if (r < 0) {
4885                 *exit_status = EXIT_FDS;
4886                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4887         }
4888
4889         int keep_fds[n_fds + 3];
4890         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4891         n_keep_fds = n_fds;
4892
4893         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4894         if (r < 0) {
4895                 *exit_status = EXIT_FDS;
4896                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4897         }
4898
4899 #if HAVE_LIBBPF
4900         if (unit->manager->restrict_fs) {
4901                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4902                 if (bpf_map_fd < 0) {
4903                         *exit_status = EXIT_FDS;
4904                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4905                 }
4906
4907                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4908                 if (r < 0) {
4909                         *exit_status = EXIT_FDS;
4910                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4911                 }
4912         }
4913 #endif
4914
4915         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4916         if (r < 0) {
4917                 *exit_status = EXIT_FDS;
4918                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4919         }
4920
4921         if (!context->same_pgrp &&
4922             setsid() < 0) {
4923                 *exit_status = EXIT_SETSID;
4924                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4925         }
4926
4927         exec_context_tty_reset(context, params);
4928
4929         if (unit_shall_confirm_spawn(unit)) {
4930                 _cleanup_free_ char *cmdline = NULL;
4931
4932                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4933                 if (!cmdline) {
4934                         *exit_status = EXIT_MEMORY;
4935                         return log_oom();
4936                 }
4937
4938                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4939                 if (r != CONFIRM_EXECUTE) {
4940                         if (r == CONFIRM_PRETEND_SUCCESS) {
4941                                 *exit_status = EXIT_SUCCESS;
4942                                 return 0;
4943                         }
4944                         *exit_status = EXIT_CONFIRM;
4945                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4946                                                     "Execution cancelled by the user");
4947                 }
4948         }
4949
4950         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4951          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4952          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4953          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4954          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4955         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4956             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4957                 *exit_status = EXIT_MEMORY;
4958                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4959         }
4960
4961         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4962                 _cleanup_strv_free_ char **suggested_paths = NULL;
4963
4964                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4965                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4966                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4967                         *exit_status = EXIT_USER;
4968                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4969                 }
4970
4971                 r = compile_suggested_paths(context, params, &suggested_paths);
4972                 if (r < 0) {
4973                         *exit_status = EXIT_MEMORY;
4974                         return log_oom();
4975                 }
4976
4977                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4978                 if (r < 0) {
4979                         *exit_status = EXIT_USER;
4980                         if (r == -EILSEQ)
4981                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4982                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4983                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4984                 }
4985
4986                 if (!uid_is_valid(uid)) {
4987                         *exit_status = EXIT_USER;
4988                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4989                 }
4990
4991                 if (!gid_is_valid(gid)) {
4992                         *exit_status = EXIT_USER;
4993                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4994                 }
4995
4996                 if (runtime->dynamic_creds->user)
4997                         username = runtime->dynamic_creds->user->name;
4998
4999         } else {
5000                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5001                 if (r < 0) {
5002                         *exit_status = EXIT_USER;
5003                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5004                 }
5005
5006                 r = get_fixed_group(context, &groupname, &gid);
5007                 if (r < 0) {
5008                         *exit_status = EXIT_GROUP;
5009                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5010                 }
5011         }
5012
5013         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5014         r = get_supplementary_groups(context, username, groupname, gid,
5015                                      &supplementary_gids, &ngids);
5016         if (r < 0) {
5017                 *exit_status = EXIT_GROUP;
5018                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5019         }
5020
5021         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5022         if (r < 0) {
5023                 *exit_status = EXIT_USER;
5024                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5025         }
5026
5027         user_lookup_fd = safe_close(user_lookup_fd);
5028
5029         r = acquire_home(context, uid, &home, &home_buffer);
5030         if (r < 0) {
5031                 *exit_status = EXIT_CHDIR;
5032                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5033         }
5034
5035         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5036         if (socket_fd >= 0)
5037                 (void) fd_nonblock(socket_fd, false);
5038
5039         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5040          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5041         if (params->cgroup_path) {
5042                 _cleanup_free_ char *p = NULL;
5043
5044                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5045                 if (r < 0) {
5046                         *exit_status = EXIT_CGROUP;
5047                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5048                 }
5049
5050                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5051                 if (r == -EUCLEAN) {
5052                         *exit_status = EXIT_CGROUP;
5053                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5054                                                     "because the cgroup or one of its parents or "
5055                                                     "siblings is in the threaded mode: %m", p);
5056                 }
5057                 if (r < 0) {
5058                         *exit_status = EXIT_CGROUP;
5059                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5060                 }
5061         }
5062
5063         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5064                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5065                 if (r < 0) {
5066                         *exit_status = EXIT_NETWORK;
5067                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5068                 }
5069         }
5070
5071         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5072                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5073                 if (r < 0) {
5074                         *exit_status = EXIT_NAMESPACE;
5075                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5076                 }
5077         }
5078
5079         r = setup_input(context, params, socket_fd, named_iofds);
5080         if (r < 0) {
5081                 *exit_status = EXIT_STDIN;
5082                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5083         }
5084
5085         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5086         if (r < 0) {
5087                 *exit_status = EXIT_STDOUT;
5088                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5089         }
5090
5091         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5092         if (r < 0) {
5093                 *exit_status = EXIT_STDERR;
5094                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5095         }
5096
5097         if (context->oom_score_adjust_set) {
5098                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5099                  * prohibit write access to this file, and we shouldn't trip up over that. */
5100                 r = set_oom_score_adjust(context->oom_score_adjust);
5101                 if (r < 0) {
5102                         if (ERRNO_IS_PRIVILEGE(r))
5103                                 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5104                         else {
5105                                 *exit_status = EXIT_OOM_ADJUST;
5106                                 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5107                         }
5108                 }
5109         }
5110
5111         if (context->coredump_filter_set) {
5112                 r = set_coredump_filter(context->coredump_filter);
5113                 if (r < 0) {
5114                         if (ERRNO_IS_PRIVILEGE(r))
5115                                 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5116                         else
5117                                 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5118                 }
5119         }
5120
5121         if (context->nice_set) {
5122                 r = setpriority_closest(context->nice);
5123                 if (r < 0)
5124                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5125         }
5126
5127         if (context->cpu_sched_set) {
5128                 struct sched_param param = {
5129                         .sched_priority = context->cpu_sched_priority,
5130                 };
5131
5132                 r = sched_setscheduler(0,
5133                                        context->cpu_sched_policy |
5134                                        (context->cpu_sched_reset_on_fork ?
5135                                         SCHED_RESET_ON_FORK : 0),
5136                                        &param);
5137                 if (r < 0) {
5138                         *exit_status = EXIT_SETSCHEDULER;
5139                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5140                 }
5141         }
5142
5143         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5144                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5145                 const CPUSet *cpu_set;
5146
5147                 if (context->cpu_affinity_from_numa) {
5148                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5149                         if (r < 0) {
5150                                 *exit_status = EXIT_CPUAFFINITY;
5151                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5152                         }
5153
5154                         cpu_set = &converted_cpu_set;
5155                 } else
5156                         cpu_set = &context->cpu_set;
5157
5158                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5159                         *exit_status = EXIT_CPUAFFINITY;
5160                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5161                 }
5162         }
5163
5164         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5165                 r = apply_numa_policy(&context->numa_policy);
5166                 if (r < 0) {
5167                         if (ERRNO_IS_NOT_SUPPORTED(r))
5168                                 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5169                         else {
5170                                 *exit_status = EXIT_NUMA_POLICY;
5171                                 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5172                         }
5173                 }
5174         }
5175
5176         if (context->ioprio_set)
5177                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5178                         *exit_status = EXIT_IOPRIO;
5179                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5180                 }
5181
5182         if (context->timer_slack_nsec != NSEC_INFINITY)
5183                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5184                         *exit_status = EXIT_TIMERSLACK;
5185                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5186                 }
5187
5188         if (context->personality != PERSONALITY_INVALID) {
5189                 r = safe_personality(context->personality);
5190                 if (r < 0) {
5191                         *exit_status = EXIT_PERSONALITY;
5192                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5193                 }
5194         }
5195
5196         if (context->utmp_id) {
5197                 const char *line = context->tty_path ?
5198                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5199                         NULL;
5200                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5201                                       line,
5202                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
5203                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5204                                       USER_PROCESS,
5205                                       username);
5206         }
5207
5208         if (uid_is_valid(uid)) {
5209                 r = chown_terminal(STDIN_FILENO, uid);
5210                 if (r < 0) {
5211                         *exit_status = EXIT_STDIN;
5212                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5213                 }
5214         }
5215
5216         if (params->cgroup_path) {
5217                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5218                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5219                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5220                  * touch a single hierarchy too. */
5221
5222                 if (params->flags & EXEC_CGROUP_DELEGATE) {
5223                         _cleanup_free_ char *p = NULL;
5224
5225                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5226                         if (r < 0) {
5227                                 *exit_status = EXIT_CGROUP;
5228                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5229                         }
5230
5231                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5232                         if (r < 0) {
5233                                 *exit_status = EXIT_CGROUP;
5234                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5235                         }
5236                         if (r > 0) {
5237                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5238                                 if (r < 0) {
5239                                         *exit_status = EXIT_CGROUP;
5240                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5241                                 }
5242                         }
5243                 }
5244
5245                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5246                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
5247                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5248                                 if (r < 0) {
5249                                         *exit_status = EXIT_MEMORY;
5250                                         return log_oom();
5251                                 }
5252
5253                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5254                                 if (r < 0) {
5255                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5256                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5257                                         memory_pressure_path = mfree(memory_pressure_path);
5258                                 }
5259                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5260                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5261                                 if (!memory_pressure_path) {
5262                                         *exit_status = EXIT_MEMORY;
5263                                         return log_oom();
5264                                 }
5265                         }
5266                 }
5267         }
5268
5269         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5270
5271         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5272                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5273                 if (r < 0)
5274                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5275         }
5276
5277         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5278                 r = setup_credentials(context, params, unit->id, uid);
5279                 if (r < 0) {
5280                         *exit_status = EXIT_CREDENTIALS;
5281                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5282                 }
5283         }
5284
5285         r = build_environment(
5286                         unit,
5287                         context,
5288                         params,
5289                         cgroup_context,
5290                         n_fds,
5291                         fdnames,
5292                         home,
5293                         username,
5294                         shell,
5295                         journal_stream_dev,
5296                         journal_stream_ino,
5297                         memory_pressure_path,
5298                         &our_env);
5299         if (r < 0) {
5300                 *exit_status = EXIT_MEMORY;
5301                 return log_oom();
5302         }
5303
5304         r = build_pass_environment(context, &pass_env);
5305         if (r < 0) {
5306                 *exit_status = EXIT_MEMORY;
5307                 return log_oom();
5308         }
5309
5310         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5311          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5312          * not specify PATH but the unit has ExecSearchPath. */
5313         if (!strv_isempty(context->exec_search_path)) {
5314                 _cleanup_free_ char *joined = NULL;
5315
5316                 joined = strv_join(context->exec_search_path, ":");
5317                 if (!joined) {
5318                         *exit_status = EXIT_MEMORY;
5319                         return log_oom();
5320                 }
5321
5322                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5323                 if (r < 0) {
5324                         *exit_status = EXIT_MEMORY;
5325                         return log_oom();
5326                 }
5327         }
5328
5329         accum_env = strv_env_merge(params->environment,
5330                                    our_env,
5331                                    joined_exec_search_path,
5332                                    pass_env,
5333                                    context->environment,
5334                                    files_env);
5335         if (!accum_env) {
5336                 *exit_status = EXIT_MEMORY;
5337                 return log_oom();
5338         }
5339         accum_env = strv_env_clean(accum_env);
5340
5341         (void) umask(context->umask);
5342
5343         r = setup_keyring(unit, context, params, uid, gid);
5344         if (r < 0) {
5345                 *exit_status = EXIT_KEYRING;
5346                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5347         }
5348
5349         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5350          * from it. */
5351         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5352
5353         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5354          * for it, and the kernel doesn't actually support ambient caps. */
5355         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5356
5357         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5358          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5359          * desired. */
5360         if (needs_ambient_hack)
5361                 needs_setuid = false;
5362         else
5363                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5364
5365         uint64_t capability_ambient_set = context->capability_ambient_set;
5366
5367         if (needs_sandboxing) {
5368                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5369                  * /sys being present. The actual MAC context application will happen later, as late as
5370                  * possible, to avoid impacting our own code paths. */
5371
5372 #if HAVE_SELINUX
5373                 use_selinux = mac_selinux_use();
5374 #endif
5375 #if ENABLE_SMACK
5376                 use_smack = mac_smack_use();
5377 #endif
5378 #if HAVE_APPARMOR
5379                 use_apparmor = mac_apparmor_use();
5380 #endif
5381         }
5382
5383         if (needs_sandboxing) {
5384                 int which_failed;
5385
5386                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5387                  * is set here. (See below.) */
5388
5389                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5390                 if (r < 0) {
5391                         *exit_status = EXIT_LIMITS;
5392                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5393                 }
5394         }
5395
5396         if (needs_setuid && context->pam_name && username) {
5397                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5398                  * wins here. (See above.) */
5399
5400                 /* All fds passed in the fds array will be closed in the pam child process. */
5401                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5402                 if (r < 0) {
5403                         *exit_status = EXIT_PAM;
5404                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5405                 }
5406
5407                 if (ambient_capabilities_supported()) {
5408                         uint64_t ambient_after_pam;
5409
5410                         /* PAM modules might have set some ambient caps. Query them here and merge them into
5411                          * the caps we want to set in the end, so that we don't end up unsetting them. */
5412                         r = capability_get_ambient(&ambient_after_pam);
5413                         if (r < 0) {
5414                                 *exit_status = EXIT_CAPABILITIES;
5415                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5416                         }
5417
5418                         capability_ambient_set |= ambient_after_pam;
5419                 }
5420
5421                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5422                 if (ngids_after_pam < 0) {
5423                         *exit_status = EXIT_MEMORY;
5424                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5425                 }
5426         }
5427
5428         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5429                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5430                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5431                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5432
5433                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5434                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5435                  * the actual requested operations fail (or silently continue). */
5436                 if (r < 0 && context->private_users) {
5437                         *exit_status = EXIT_USER;
5438                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5439                 }
5440                 if (r < 0)
5441                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5442                 else
5443                         userns_set_up = true;
5444         }
5445
5446         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5447
5448                 /* Try to enable network namespacing if network namespacing is available and we have
5449                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5450                  * new network namespace. And if we don't have that, then we could only create a network
5451                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5452                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5453                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5454                         if (r < 0) {
5455                                 if (ERRNO_IS_PRIVILEGE(r))
5456                                         log_unit_notice_errno(unit, r,
5457                                                                "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5458                                 else {
5459                                         *exit_status = EXIT_NETWORK;
5460                                         return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5461                                 }
5462                         }
5463                 } else if (context->network_namespace_path) {
5464                         *exit_status = EXIT_NETWORK;
5465                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5466                                                     "NetworkNamespacePath= is not supported, refusing.");
5467                 } else
5468                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5469         }
5470
5471         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5472
5473                 if (ns_type_supported(NAMESPACE_IPC)) {
5474                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5475                         if (r == -EPERM)
5476                                 log_unit_warning_errno(unit, r,
5477                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5478                         else if (r < 0) {
5479                                 *exit_status = EXIT_NAMESPACE;
5480                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5481                         }
5482                 } else if (context->ipc_namespace_path) {
5483                         *exit_status = EXIT_NAMESPACE;
5484                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5485                                                     "IPCNamespacePath= is not supported, refusing.");
5486                 } else
5487                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5488         }
5489
5490         if (needs_mount_namespace) {
5491                 _cleanup_free_ char *error_path = NULL;
5492
5493                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5494                 if (r < 0) {
5495                         *exit_status = EXIT_NAMESPACE;
5496                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5497                                                     error_path ? ": " : "", strempty(error_path));
5498                 }
5499         }
5500
5501         if (needs_sandboxing) {
5502                 r = apply_protect_hostname(unit, context, exit_status);
5503                 if (r < 0)
5504                         return r;
5505         }
5506
5507         if (context->memory_ksm >= 0)
5508                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5509                         if (ERRNO_IS_NOT_SUPPORTED(errno))
5510                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5511                         else {
5512                                 *exit_status = EXIT_KSM;
5513                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5514                         }
5515                 }
5516
5517         /* Drop groups as early as possible.
5518          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5519          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5520         if (needs_setuid) {
5521                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5522                 int ngids_to_enforce = 0;
5523
5524                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5525                                                    ngids,
5526                                                    gids_after_pam,
5527                                                    ngids_after_pam,
5528                                                    &gids_to_enforce);
5529                 if (ngids_to_enforce < 0) {
5530                         *exit_status = EXIT_MEMORY;
5531                         return log_unit_error_errno(unit,
5532                                                     ngids_to_enforce,
5533                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
5534                 }
5535
5536                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5537                 if (r < 0) {
5538                         *exit_status = EXIT_GROUP;
5539                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5540                 }
5541         }
5542
5543         /* If the user namespace was not set up above, try to do it now.
5544          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5545          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5546          * case of mount namespaces being less privileged when the mount point list is copied from a
5547          * different user namespace). */
5548
5549         if (needs_sandboxing && context->private_users && !userns_set_up) {
5550                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5551                 if (r < 0) {
5552                         *exit_status = EXIT_USER;
5553                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5554                 }
5555         }
5556
5557         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5558          * shall execute. */
5559
5560         _cleanup_free_ char *executable = NULL;
5561         _cleanup_close_ int executable_fd = -EBADF;
5562         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5563         if (r < 0) {
5564                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5565                         log_unit_struct_errno(unit, LOG_INFO, r,
5566                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5567                                               LOG_UNIT_INVOCATION_ID(unit),
5568                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5569                                                                command->path),
5570                                               "EXECUTABLE=%s", command->path);
5571                         return 0;
5572                 }
5573
5574                 *exit_status = EXIT_EXEC;
5575
5576                 return log_unit_struct_errno(unit, LOG_INFO, r,
5577                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5578                                              LOG_UNIT_INVOCATION_ID(unit),
5579                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5580                                                               command->path),
5581                                              "EXECUTABLE=%s", command->path);
5582         }
5583
5584         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5585         if (r < 0) {
5586                 *exit_status = EXIT_FDS;
5587                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5588         }
5589
5590 #if HAVE_SELINUX
5591         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5592                 int fd = -EBADF;
5593
5594                 if (socket_fd >= 0)
5595                         fd = socket_fd;
5596                 else if (params->n_socket_fds == 1)
5597                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5598                          * use context from that fd to compute the label. */
5599                         fd = params->fds[0];
5600
5601                 if (fd >= 0) {
5602                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5603                         if (r < 0) {
5604                                 if (!context->selinux_context_ignore) {
5605                                         *exit_status = EXIT_SELINUX_CONTEXT;
5606                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5607                                 }
5608                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5609                         }
5610                 }
5611         }
5612 #endif
5613
5614         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5615          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5616          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5617          * execve(). */
5618
5619         r = close_all_fds(keep_fds, n_keep_fds);
5620         if (r >= 0)
5621                 r = shift_fds(fds, n_fds);
5622         if (r >= 0)
5623                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5624         if (r < 0) {
5625                 *exit_status = EXIT_FDS;
5626                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5627         }
5628
5629         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5630          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5631          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5632          * came this far. */
5633
5634         secure_bits = context->secure_bits;
5635
5636         if (needs_sandboxing) {
5637                 uint64_t bset;
5638
5639                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5640                  * (Note this is placed after the general resource limit initialization, see above, in order
5641                  * to take precedence.) */
5642                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5643                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5644                                 *exit_status = EXIT_LIMITS;
5645                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5646                         }
5647                 }
5648
5649 #if ENABLE_SMACK
5650                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5651                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5652                 if (use_smack) {
5653                         r = setup_smack(unit->manager, context, executable_fd);
5654                         if (r < 0 && !context->smack_process_label_ignore) {
5655                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5656                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5657                         }
5658                 }
5659 #endif
5660
5661                 bset = context->capability_bounding_set;
5662                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5663                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5664                  * instead of us doing that */
5665                 if (needs_ambient_hack)
5666                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5667                                 (UINT64_C(1) << CAP_SETUID) |
5668                                 (UINT64_C(1) << CAP_SETGID);
5669
5670                 if (!cap_test_all(bset)) {
5671                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5672                         if (r < 0) {
5673                                 *exit_status = EXIT_CAPABILITIES;
5674                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5675                         }
5676                 }
5677
5678                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5679                  * keep-caps set.
5680                  *
5681                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5682                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5683                  * the ambient capabilities can be raised as they are present in the permitted and
5684                  * inheritable set. However it is possible that someone wants to set ambient capabilities
5685                  * without changing the user, so we also set the ambient capabilities here.
5686                  *
5687                  * The requested ambient capabilities are raised in the inheritable set if the second
5688                  * argument is true. */
5689                 if (!needs_ambient_hack) {
5690                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5691                         if (r < 0) {
5692                                 *exit_status = EXIT_CAPABILITIES;
5693                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5694                         }
5695                 }
5696         }
5697
5698         /* chroot to root directory first, before we lose the ability to chroot */
5699         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5700         if (r < 0)
5701                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5702
5703         if (needs_setuid) {
5704                 if (uid_is_valid(uid)) {
5705                         r = enforce_user(context, uid, capability_ambient_set);
5706                         if (r < 0) {
5707                                 *exit_status = EXIT_USER;
5708                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5709                         }
5710
5711                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5712
5713                                 /* Raise the ambient capabilities after user change. */
5714                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5715                                 if (r < 0) {
5716                                         *exit_status = EXIT_CAPABILITIES;
5717                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5718                                 }
5719                         }
5720                 }
5721         }
5722
5723         /* Apply working directory here, because the working directory might be on NFS and only the user running
5724          * this service might have the correct privilege to change to the working directory */
5725         r = apply_working_directory(context, params, runtime, home, exit_status);
5726         if (r < 0)
5727                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5728
5729         if (needs_sandboxing) {
5730                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5731                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5732                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5733                  * are restricted. */
5734
5735 #if HAVE_SELINUX
5736                 if (use_selinux) {
5737                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5738
5739                         if (exec_context) {
5740                                 r = setexeccon(exec_context);
5741                                 if (r < 0) {
5742                                         if (!context->selinux_context_ignore) {
5743                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5744                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5745                                         }
5746                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5747                                 }
5748                         }
5749                 }
5750 #endif
5751
5752 #if HAVE_APPARMOR
5753                 if (use_apparmor && context->apparmor_profile) {
5754                         r = aa_change_onexec(context->apparmor_profile);
5755                         if (r < 0 && !context->apparmor_profile_ignore) {
5756                                 *exit_status = EXIT_APPARMOR_PROFILE;
5757                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5758                         }
5759                 }
5760 #endif
5761
5762                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5763                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5764                  * requires CAP_SETPCAP. */
5765                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5766                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5767                          * effective set here.
5768                          *
5769                          * The effective set is overwritten during execve() with the following values:
5770                          *
5771                          * - ambient set (for non-root processes)
5772                          *
5773                          * - (inheritable | bounding) set for root processes)
5774                          *
5775                          * Hence there is no security impact to raise it in the effective set before execve
5776                          */
5777                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5778                         if (r < 0) {
5779                                 *exit_status = EXIT_CAPABILITIES;
5780                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5781                         }
5782                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5783                                 *exit_status = EXIT_SECUREBITS;
5784                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5785                         }
5786                 }
5787
5788                 if (context_has_no_new_privileges(context))
5789                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5790                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5791                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5792                         }
5793
5794 #if HAVE_SECCOMP
5795                 r = apply_address_families(unit, context);
5796                 if (r < 0) {
5797                         *exit_status = EXIT_ADDRESS_FAMILIES;
5798                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5799                 }
5800
5801                 r = apply_memory_deny_write_execute(unit, context);
5802                 if (r < 0) {
5803                         *exit_status = EXIT_SECCOMP;
5804                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5805                 }
5806
5807                 r = apply_restrict_realtime(unit, context);
5808                 if (r < 0) {
5809                         *exit_status = EXIT_SECCOMP;
5810                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5811                 }
5812
5813                 r = apply_restrict_suid_sgid(unit, context);
5814                 if (r < 0) {
5815                         *exit_status = EXIT_SECCOMP;
5816                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5817                 }
5818
5819                 r = apply_restrict_namespaces(unit, context);
5820                 if (r < 0) {
5821                         *exit_status = EXIT_SECCOMP;
5822                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5823                 }
5824
5825                 r = apply_protect_sysctl(unit, context);
5826                 if (r < 0) {
5827                         *exit_status = EXIT_SECCOMP;
5828                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5829                 }
5830
5831                 r = apply_protect_kernel_modules(unit, context);
5832                 if (r < 0) {
5833                         *exit_status = EXIT_SECCOMP;
5834                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5835                 }
5836
5837                 r = apply_protect_kernel_logs(unit, context);
5838                 if (r < 0) {
5839                         *exit_status = EXIT_SECCOMP;
5840                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5841                 }
5842
5843                 r = apply_protect_clock(unit, context);
5844                 if (r < 0) {
5845                         *exit_status = EXIT_SECCOMP;
5846                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5847                 }
5848
5849                 r = apply_private_devices(unit, context);
5850                 if (r < 0) {
5851                         *exit_status = EXIT_SECCOMP;
5852                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5853                 }
5854
5855                 r = apply_syscall_archs(unit, context);
5856                 if (r < 0) {
5857                         *exit_status = EXIT_SECCOMP;
5858                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5859                 }
5860
5861                 r = apply_lock_personality(unit, context);
5862                 if (r < 0) {
5863                         *exit_status = EXIT_SECCOMP;
5864                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5865                 }
5866
5867                 r = apply_syscall_log(unit, context);
5868                 if (r < 0) {
5869                         *exit_status = EXIT_SECCOMP;
5870                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5871                 }
5872
5873                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5874                  * by the filter as little as possible. */
5875                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5876                 if (r < 0) {
5877                         *exit_status = EXIT_SECCOMP;
5878                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5879                 }
5880 #endif
5881
5882 #if HAVE_LIBBPF
5883                 r = apply_restrict_filesystems(unit, context);
5884                 if (r < 0) {
5885                         *exit_status = EXIT_BPF;
5886                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5887                 }
5888 #endif
5889
5890         }
5891
5892         if (!strv_isempty(context->unset_environment)) {
5893                 char **ee = NULL;
5894
5895                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5896                 if (!ee) {
5897                         *exit_status = EXIT_MEMORY;
5898                         return log_oom();
5899                 }
5900
5901                 strv_free_and_replace(accum_env, ee);
5902         }
5903
5904         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5905                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5906
5907                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5908                 if (r < 0) {
5909                         *exit_status = EXIT_MEMORY;
5910                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5911                 }
5912                 final_argv = replaced_argv;
5913
5914                 if (!strv_isempty(unset_variables)) {
5915                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5916                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5917                 }
5918
5919                 if (!strv_isempty(bad_variables)) {
5920                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5921                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5922                 }
5923         } else
5924                 final_argv = command->argv;
5925
5926         log_command_line(unit, "Executing", executable, final_argv);
5927
5928         if (exec_fd >= 0) {
5929                 uint8_t hot = 1;
5930
5931                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5932                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5933
5934                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5935                         *exit_status = EXIT_EXEC;
5936                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5937                 }
5938         }
5939
5940         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5941
5942         if (exec_fd >= 0) {
5943                 uint8_t hot = 0;
5944
5945                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5946                  * that POLLHUP on it no longer means execve() succeeded. */
5947
5948                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5949                         *exit_status = EXIT_EXEC;
5950                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5951                 }
5952         }
5953
5954         *exit_status = EXIT_EXEC;
5955         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5956 }
5957
5958 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5959 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5960
5961 int exec_spawn(Unit *unit,
5962                ExecCommand *command,
5963                const ExecContext *context,
5964                const ExecParameters *params,
5965                ExecRuntime *runtime,
5966                const CGroupContext *cgroup_context,
5967                pid_t *ret) {
5968
5969         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5970         _cleanup_free_ char *subcgroup_path = NULL;
5971         _cleanup_strv_free_ char **files_env = NULL;
5972         size_t n_storage_fds = 0, n_socket_fds = 0;
5973         pid_t pid;
5974
5975         assert(unit);
5976         assert(command);
5977         assert(context);
5978         assert(ret);
5979         assert(params);
5980         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5981
5982         LOG_CONTEXT_PUSH_UNIT(unit);
5983
5984         if (context->std_input == EXEC_INPUT_SOCKET ||
5985             context->std_output == EXEC_OUTPUT_SOCKET ||
5986             context->std_error == EXEC_OUTPUT_SOCKET) {
5987
5988                 if (params->n_socket_fds > 1)
5989                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5990
5991                 if (params->n_socket_fds == 0)
5992                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5993
5994                 socket_fd = params->fds[0];
5995         } else {
5996                 socket_fd = -EBADF;
5997                 fds = params->fds;
5998                 n_socket_fds = params->n_socket_fds;
5999                 n_storage_fds = params->n_storage_fds;
6000         }
6001
6002         r = exec_context_named_iofds(context, params, named_iofds);
6003         if (r < 0)
6004                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6005
6006         r = exec_context_load_environment(unit, context, &files_env);
6007         if (r < 0)
6008                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6009
6010         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6011            and, until the next SELinux policy changes, we save further reloads in future children. */
6012         mac_selinux_maybe_reload();
6013
6014         /* We won't know the real executable path until we create the mount namespace in the child, but we
6015            want to log from the parent, so we use the possibly inaccurate path here. */
6016         log_command_line(unit, "About to execute", command->path, command->argv);
6017
6018         if (params->cgroup_path) {
6019                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6020                 if (r < 0)
6021                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6022                 if (r > 0) {
6023                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6024                          * realized by the unit logic) */
6025
6026                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6027                         if (r < 0)
6028                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6029                 }
6030         }
6031
6032         pid = fork();
6033         if (pid < 0)
6034                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6035
6036         if (pid == 0) {
6037                 int exit_status = EXIT_SUCCESS;
6038
6039                 r = exec_child(unit,
6040                                command,
6041                                context,
6042                                params,
6043                                runtime,
6044                                cgroup_context,
6045                                socket_fd,
6046                                named_iofds,
6047                                fds,
6048                                n_socket_fds,
6049                                n_storage_fds,
6050                                files_env,
6051                                unit->manager->user_lookup_fds[1],
6052                                &exit_status);
6053
6054                 if (r < 0) {
6055                         const char *status =
6056                                 exit_status_to_string(exit_status,
6057                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6058
6059                         log_unit_struct_errno(unit, LOG_ERR, r,
6060                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6061                                               LOG_UNIT_INVOCATION_ID(unit),
6062                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6063                                                                status, command->path),
6064                                               "EXECUTABLE=%s", command->path);
6065                 }
6066
6067                 _exit(exit_status);
6068         }
6069
6070         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6071
6072         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6073          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6074          * process will be killed too). */
6075         if (subcgroup_path)
6076                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6077
6078         exec_status_start(&command->exec_status, pid);
6079
6080         *ret = pid;
6081         return 0;
6082 }
6083
6084 void exec_context_init(ExecContext *c) {
6085         assert(c);
6086
6087         c->umask = 0022;
6088         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6089         c->cpu_sched_policy = SCHED_OTHER;
6090         c->syslog_priority = LOG_DAEMON|LOG_INFO;
6091         c->syslog_level_prefix = true;
6092         c->ignore_sigpipe = true;
6093         c->timer_slack_nsec = NSEC_INFINITY;
6094         c->personality = PERSONALITY_INVALID;
6095         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6096                 c->directories[t].mode = 0755;
6097         c->timeout_clean_usec = USEC_INFINITY;
6098         c->capability_bounding_set = CAP_MASK_UNSET;
6099         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6100         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6101         c->log_level_max = -1;
6102 #if HAVE_SECCOMP
6103         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6104 #endif
6105         c->tty_rows = UINT_MAX;
6106         c->tty_cols = UINT_MAX;
6107         numa_policy_reset(&c->numa_policy);
6108         c->private_mounts = -1;
6109         c->memory_ksm = -1;
6110 }
6111
6112 void exec_context_done(ExecContext *c) {
6113         assert(c);
6114
6115         c->environment = strv_free(c->environment);
6116         c->environment_files = strv_free(c->environment_files);
6117         c->pass_environment = strv_free(c->pass_environment);
6118         c->unset_environment = strv_free(c->unset_environment);
6119
6120         rlimit_free_all(c->rlimit);
6121
6122         for (size_t l = 0; l < 3; l++) {
6123                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6124                 c->stdio_file[l] = mfree(c->stdio_file[l]);
6125         }
6126
6127         c->working_directory = mfree(c->working_directory);
6128         c->root_directory = mfree(c->root_directory);
6129         c->root_image = mfree(c->root_image);
6130         c->root_image_options = mount_options_free_all(c->root_image_options);
6131         c->root_hash = mfree(c->root_hash);
6132         c->root_hash_size = 0;
6133         c->root_hash_path = mfree(c->root_hash_path);
6134         c->root_hash_sig = mfree(c->root_hash_sig);
6135         c->root_hash_sig_size = 0;
6136         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6137         c->root_verity = mfree(c->root_verity);
6138         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6139         c->extension_directories = strv_free(c->extension_directories);
6140         c->tty_path = mfree(c->tty_path);
6141         c->syslog_identifier = mfree(c->syslog_identifier);
6142         c->user = mfree(c->user);
6143         c->group = mfree(c->group);
6144
6145         c->supplementary_groups = strv_free(c->supplementary_groups);
6146
6147         c->pam_name = mfree(c->pam_name);
6148
6149         c->read_only_paths = strv_free(c->read_only_paths);
6150         c->read_write_paths = strv_free(c->read_write_paths);
6151         c->inaccessible_paths = strv_free(c->inaccessible_paths);
6152         c->exec_paths = strv_free(c->exec_paths);
6153         c->no_exec_paths = strv_free(c->no_exec_paths);
6154         c->exec_search_path = strv_free(c->exec_search_path);
6155
6156         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6157         c->bind_mounts = NULL;
6158         c->n_bind_mounts = 0;
6159         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6160         c->temporary_filesystems = NULL;
6161         c->n_temporary_filesystems = 0;
6162         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6163
6164         cpu_set_reset(&c->cpu_set);
6165         numa_policy_reset(&c->numa_policy);
6166
6167         c->utmp_id = mfree(c->utmp_id);
6168         c->selinux_context = mfree(c->selinux_context);
6169         c->apparmor_profile = mfree(c->apparmor_profile);
6170         c->smack_process_label = mfree(c->smack_process_label);
6171
6172         c->restrict_filesystems = set_free_free(c->restrict_filesystems);
6173
6174         c->syscall_filter = hashmap_free(c->syscall_filter);
6175         c->syscall_archs = set_free(c->syscall_archs);
6176         c->address_families = set_free(c->address_families);
6177
6178         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6179                 exec_directory_done(&c->directories[t]);
6180
6181         c->log_level_max = -1;
6182
6183         exec_context_free_log_extra_fields(c);
6184         c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
6185         c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
6186
6187         c->log_ratelimit_interval_usec = 0;
6188         c->log_ratelimit_burst = 0;
6189
6190         c->stdin_data = mfree(c->stdin_data);
6191         c->stdin_data_size = 0;
6192
6193         c->network_namespace_path = mfree(c->network_namespace_path);
6194         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6195
6196         c->log_namespace = mfree(c->log_namespace);
6197
6198         c->load_credentials = hashmap_free(c->load_credentials);
6199         c->set_credentials = hashmap_free(c->set_credentials);
6200         c->import_credentials = set_free_free(c->import_credentials);
6201
6202         c->root_image_policy = image_policy_free(c->root_image_policy);
6203         c->mount_image_policy = image_policy_free(c->mount_image_policy);
6204         c->extension_image_policy = image_policy_free(c->extension_image_policy);
6205 }
6206
6207 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6208         assert(c);
6209
6210         if (!runtime_prefix)
6211                 return 0;
6212
6213         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6214                 _cleanup_free_ char *p = NULL;
6215
6216                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6217                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6218                 else
6219                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6220                 if (!p)
6221                         return -ENOMEM;
6222
6223                 /* We execute this synchronously, since we need to be sure this is gone when we start the
6224                  * service next. */
6225                 (void) rm_rf(p, REMOVE_ROOT);
6226
6227                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6228                         _cleanup_free_ char *symlink_abs = NULL;
6229
6230                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6231                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6232                         else
6233                                 symlink_abs = path_join(runtime_prefix, *symlink);
6234                         if (!symlink_abs)
6235                                 return -ENOMEM;
6236
6237                         (void) unlink(symlink_abs);
6238                 }
6239         }
6240
6241         return 0;
6242 }
6243
6244 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6245         _cleanup_free_ char *p = NULL;
6246
6247         assert(c);
6248
6249         if (!runtime_prefix || !unit)
6250                 return 0;
6251
6252         p = path_join(runtime_prefix, "credentials", unit);
6253         if (!p)
6254                 return -ENOMEM;
6255
6256         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6257          * unmount it, and afterwards remove the mount point */
6258         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6259         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6260
6261         return 0;
6262 }
6263
6264 int exec_context_destroy_mount_ns_dir(Unit *u) {
6265         _cleanup_free_ char *p = NULL;
6266
6267         if (!u || !MANAGER_IS_SYSTEM(u->manager))
6268                 return 0;
6269
6270         p = path_join("/run/systemd/propagate/", u->id);
6271         if (!p)
6272                 return -ENOMEM;
6273
6274         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6275         if (rmdir(p) < 0 && errno != ENOENT)
6276                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6277
6278         return 0;
6279 }
6280
6281 static void exec_command_done(ExecCommand *c) {
6282         assert(c);
6283
6284         c->path = mfree(c->path);
6285         c->argv = strv_free(c->argv);
6286 }
6287
6288 void exec_command_done_array(ExecCommand *c, size_t n) {
6289         for (size_t i = 0; i < n; i++)
6290                 exec_command_done(c+i);
6291 }
6292
6293 ExecCommand* exec_command_free_list(ExecCommand *c) {
6294         ExecCommand *i;
6295
6296         while ((i = c)) {
6297                 LIST_REMOVE(command, c, i);
6298                 exec_command_done(i);
6299                 free(i);
6300         }
6301
6302         return NULL;
6303 }
6304
6305 void exec_command_free_array(ExecCommand **c, size_t n) {
6306         for (size_t i = 0; i < n; i++)
6307                 c[i] = exec_command_free_list(c[i]);
6308 }
6309
6310 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6311         for (size_t i = 0; i < n; i++)
6312                 exec_status_reset(&c[i].exec_status);
6313 }
6314
6315 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6316         for (size_t i = 0; i < n; i++)
6317                 LIST_FOREACH(command, z, c[i])
6318                         exec_status_reset(&z->exec_status);
6319 }
6320
6321 typedef struct InvalidEnvInfo {
6322         const Unit *unit;
6323         const char *path;
6324 } InvalidEnvInfo;
6325
6326 static void invalid_env(const char *p, void *userdata) {
6327         InvalidEnvInfo *info = userdata;
6328
6329         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6330 }
6331
6332 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6333         assert(c);
6334
6335         switch (fd_index) {
6336
6337         case STDIN_FILENO:
6338                 if (c->std_input != EXEC_INPUT_NAMED_FD)
6339                         return NULL;
6340
6341                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6342
6343         case STDOUT_FILENO:
6344                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6345                         return NULL;
6346
6347                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6348
6349         case STDERR_FILENO:
6350                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6351                         return NULL;
6352
6353                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6354
6355         default:
6356                 return NULL;
6357         }
6358 }
6359
6360 static int exec_context_named_iofds(
6361                 const ExecContext *c,
6362                 const ExecParameters *p,
6363                 int named_iofds[static 3]) {
6364
6365         size_t targets;
6366         const char* stdio_fdname[3];
6367         size_t n_fds;
6368
6369         assert(c);
6370         assert(p);
6371         assert(named_iofds);
6372
6373         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6374                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6375                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
6376
6377         for (size_t i = 0; i < 3; i++)
6378                 stdio_fdname[i] = exec_context_fdname(c, i);
6379
6380         n_fds = p->n_storage_fds + p->n_socket_fds;
6381
6382         for (size_t i = 0; i < n_fds  && targets > 0; i++)
6383                 if (named_iofds[STDIN_FILENO] < 0 &&
6384                     c->std_input == EXEC_INPUT_NAMED_FD &&
6385                     stdio_fdname[STDIN_FILENO] &&
6386                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6387
6388                         named_iofds[STDIN_FILENO] = p->fds[i];
6389                         targets--;
6390
6391                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6392                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
6393                            stdio_fdname[STDOUT_FILENO] &&
6394                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6395
6396                         named_iofds[STDOUT_FILENO] = p->fds[i];
6397                         targets--;
6398
6399                 } else if (named_iofds[STDERR_FILENO] < 0 &&
6400                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
6401                            stdio_fdname[STDERR_FILENO] &&
6402                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6403
6404                         named_iofds[STDERR_FILENO] = p->fds[i];
6405                         targets--;
6406                 }
6407
6408         return targets == 0 ? 0 : -ENOENT;
6409 }
6410
6411 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6412         _cleanup_strv_free_ char **v = NULL;
6413         int r;
6414
6415         assert(c);
6416         assert(ret);
6417
6418         STRV_FOREACH(i, c->environment_files) {
6419                 _cleanup_globfree_ glob_t pglob = {};
6420                 bool ignore = false;
6421                 char *fn = *i;
6422
6423                 if (fn[0] == '-') {
6424                         ignore = true;
6425                         fn++;
6426                 }
6427
6428                 if (!path_is_absolute(fn)) {
6429                         if (ignore)
6430                                 continue;
6431                         return -EINVAL;
6432                 }
6433
6434                 /* Filename supports globbing, take all matching files */
6435                 r = safe_glob(fn, 0, &pglob);
6436                 if (r < 0) {
6437                         if (ignore)
6438                                 continue;
6439                         return r;
6440                 }
6441
6442                 /* When we don't match anything, -ENOENT should be returned */
6443                 assert(pglob.gl_pathc > 0);
6444
6445                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6446                         _cleanup_strv_free_ char **p = NULL;
6447
6448                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6449                         if (r < 0) {
6450                                 if (ignore)
6451                                         continue;
6452                                 return r;
6453                         }
6454
6455                         /* Log invalid environment variables with filename */
6456                         if (p) {
6457                                 InvalidEnvInfo info = {
6458                                         .unit = unit,
6459                                         .path = pglob.gl_pathv[n]
6460                                 };
6461
6462                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
6463                         }
6464
6465                         if (!v)
6466                                 v = TAKE_PTR(p);
6467                         else {
6468                                 char **m = strv_env_merge(v, p);
6469                                 if (!m)
6470                                         return -ENOMEM;
6471
6472                                 strv_free_and_replace(v, m);
6473                         }
6474                 }
6475         }
6476
6477         *ret = TAKE_PTR(v);
6478
6479         return 0;
6480 }
6481
6482 static bool tty_may_match_dev_console(const char *tty) {
6483         _cleanup_free_ char *resolved = NULL;
6484
6485         if (!tty)
6486                 return true;
6487
6488         tty = skip_dev_prefix(tty);
6489
6490         /* trivial identity? */
6491         if (streq(tty, "console"))
6492                 return true;
6493
6494         if (resolve_dev_console(&resolved) < 0)
6495                 return true; /* if we could not resolve, assume it may */
6496
6497         /* "tty0" means the active VC, so it may be the same sometimes */
6498         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6499 }
6500
6501 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6502         assert(ec);
6503
6504         return ec->tty_reset ||
6505                 ec->tty_vhangup ||
6506                 ec->tty_vt_disallocate ||
6507                 is_terminal_input(ec->std_input) ||
6508                 is_terminal_output(ec->std_output) ||
6509                 is_terminal_output(ec->std_error);
6510 }
6511
6512 bool exec_context_may_touch_console(const ExecContext *ec) {
6513
6514         return exec_context_may_touch_tty(ec) &&
6515                tty_may_match_dev_console(exec_context_tty_path(ec));
6516 }
6517
6518 static void strv_fprintf(FILE *f, char **l) {
6519         assert(f);
6520
6521         STRV_FOREACH(g, l)
6522                 fprintf(f, " %s", *g);
6523 }
6524
/* Writes a "<prefix><name>: item item …" line for the string list 'strv' to 'f'. Nothing at all is written
 * if the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6536
6537 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6538         int r;
6539
6540         assert(c);
6541         assert(f);
6542
6543         prefix = strempty(prefix);
6544
6545         fprintf(f,
6546                 "%sUMask: %04o\n"
6547                 "%sWorkingDirectory: %s\n"
6548                 "%sRootDirectory: %s\n"
6549                 "%sRootEphemeral: %s\n"
6550                 "%sNonBlocking: %s\n"
6551                 "%sPrivateTmp: %s\n"
6552                 "%sPrivateDevices: %s\n"
6553                 "%sProtectKernelTunables: %s\n"
6554                 "%sProtectKernelModules: %s\n"
6555                 "%sProtectKernelLogs: %s\n"
6556                 "%sProtectClock: %s\n"
6557                 "%sProtectControlGroups: %s\n"
6558                 "%sPrivateNetwork: %s\n"
6559                 "%sPrivateUsers: %s\n"
6560                 "%sProtectHome: %s\n"
6561                 "%sProtectSystem: %s\n"
6562                 "%sMountAPIVFS: %s\n"
6563                 "%sIgnoreSIGPIPE: %s\n"
6564                 "%sMemoryDenyWriteExecute: %s\n"
6565                 "%sRestrictRealtime: %s\n"
6566                 "%sRestrictSUIDSGID: %s\n"
6567                 "%sKeyringMode: %s\n"
6568                 "%sProtectHostname: %s\n"
6569                 "%sProtectProc: %s\n"
6570                 "%sProcSubset: %s\n",
6571                 prefix, c->umask,
6572                 prefix, empty_to_root(c->working_directory),
6573                 prefix, empty_to_root(c->root_directory),
6574                 prefix, yes_no(c->root_ephemeral),
6575                 prefix, yes_no(c->non_blocking),
6576                 prefix, yes_no(c->private_tmp),
6577                 prefix, yes_no(c->private_devices),
6578                 prefix, yes_no(c->protect_kernel_tunables),
6579                 prefix, yes_no(c->protect_kernel_modules),
6580                 prefix, yes_no(c->protect_kernel_logs),
6581                 prefix, yes_no(c->protect_clock),
6582                 prefix, yes_no(c->protect_control_groups),
6583                 prefix, yes_no(c->private_network),
6584                 prefix, yes_no(c->private_users),
6585                 prefix, protect_home_to_string(c->protect_home),
6586                 prefix, protect_system_to_string(c->protect_system),
6587                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6588                 prefix, yes_no(c->ignore_sigpipe),
6589                 prefix, yes_no(c->memory_deny_write_execute),
6590                 prefix, yes_no(c->restrict_realtime),
6591                 prefix, yes_no(c->restrict_suid_sgid),
6592                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6593                 prefix, yes_no(c->protect_hostname),
6594                 prefix, protect_proc_to_string(c->protect_proc),
6595                 prefix, proc_subset_to_string(c->proc_subset));
6596
6597         if (c->root_image)
6598                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6599
6600         if (c->root_image_options) {
6601                 fprintf(f, "%sRootImageOptions:", prefix);
6602                 LIST_FOREACH(mount_options, o, c->root_image_options)
6603                         if (!isempty(o->options))
6604                                 fprintf(f, " %s:%s",
6605                                         partition_designator_to_string(o->partition_designator),
6606                                         o->options);
6607                 fprintf(f, "\n");
6608         }
6609
6610         if (c->root_hash) {
6611                 _cleanup_free_ char *encoded = NULL;
6612                 encoded = hexmem(c->root_hash, c->root_hash_size);
6613                 if (encoded)
6614                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6615         }
6616
6617         if (c->root_hash_path)
6618                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6619
6620         if (c->root_hash_sig) {
6621                 _cleanup_free_ char *encoded = NULL;
6622                 ssize_t len;
6623                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6624                 if (len)
6625                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6626         }
6627
6628         if (c->root_hash_sig_path)
6629                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6630
6631         if (c->root_verity)
6632                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6633
6634         STRV_FOREACH(e, c->environment)
6635                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6636
6637         STRV_FOREACH(e, c->environment_files)
6638                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6639
6640         STRV_FOREACH(e, c->pass_environment)
6641                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6642
6643         STRV_FOREACH(e, c->unset_environment)
6644                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6645
6646         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6647
6648         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6649                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6650
6651                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6652                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6653
6654                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6655                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6656                 }
6657         }
6658
6659         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6660
6661         if (c->nice_set)
6662                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6663
6664         if (c->oom_score_adjust_set)
6665                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6666
6667         if (c->coredump_filter_set)
6668                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6669
6670         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6671                 if (c->rlimit[i]) {
6672                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6673                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6674                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6675                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6676                 }
6677
6678         if (c->ioprio_set) {
6679                 _cleanup_free_ char *class_str = NULL;
6680
6681                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6682                 if (r >= 0)
6683                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6684
6685                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6686         }
6687
6688         if (c->cpu_sched_set) {
6689                 _cleanup_free_ char *policy_str = NULL;
6690
6691                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6692                 if (r >= 0)
6693                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6694
6695                 fprintf(f,
6696                         "%sCPUSchedulingPriority: %i\n"
6697                         "%sCPUSchedulingResetOnFork: %s\n",
6698                         prefix, c->cpu_sched_priority,
6699                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6700         }
6701
6702         if (c->cpu_set.set) {
6703                 _cleanup_free_ char *affinity = NULL;
6704
6705                 affinity = cpu_set_to_range_string(&c->cpu_set);
6706                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6707         }
6708
6709         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6710                 _cleanup_free_ char *nodes = NULL;
6711
6712                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6713                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6714                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6715         }
6716
6717         if (c->timer_slack_nsec != NSEC_INFINITY)
6718                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6719
6720         fprintf(f,
6721                 "%sStandardInput: %s\n"
6722                 "%sStandardOutput: %s\n"
6723                 "%sStandardError: %s\n",
6724                 prefix, exec_input_to_string(c->std_input),
6725                 prefix, exec_output_to_string(c->std_output),
6726                 prefix, exec_output_to_string(c->std_error));
6727
6728         if (c->std_input == EXEC_INPUT_NAMED_FD)
6729                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6730         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6731                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6732         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6733                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6734
6735         if (c->std_input == EXEC_INPUT_FILE)
6736                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6737         if (c->std_output == EXEC_OUTPUT_FILE)
6738                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6739         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6740                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6741         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6742                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6743         if (c->std_error == EXEC_OUTPUT_FILE)
6744                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6745         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6746                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6747         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6748                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6749
6750         if (c->tty_path)
6751                 fprintf(f,
6752                         "%sTTYPath: %s\n"
6753                         "%sTTYReset: %s\n"
6754                         "%sTTYVHangup: %s\n"
6755                         "%sTTYVTDisallocate: %s\n"
6756                         "%sTTYRows: %u\n"
6757                         "%sTTYColumns: %u\n",
6758                         prefix, c->tty_path,
6759                         prefix, yes_no(c->tty_reset),
6760                         prefix, yes_no(c->tty_vhangup),
6761                         prefix, yes_no(c->tty_vt_disallocate),
6762                         prefix, c->tty_rows,
6763                         prefix, c->tty_cols);
6764
6765         if (IN_SET(c->std_output,
6766                    EXEC_OUTPUT_KMSG,
6767                    EXEC_OUTPUT_JOURNAL,
6768                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6769                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6770             IN_SET(c->std_error,
6771                    EXEC_OUTPUT_KMSG,
6772                    EXEC_OUTPUT_JOURNAL,
6773                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6774                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6775
6776                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6777
6778                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6779                 if (r >= 0)
6780                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6781
6782                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6783                 if (r >= 0)
6784                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6785         }
6786
6787         if (c->log_level_max >= 0) {
6788                 _cleanup_free_ char *t = NULL;
6789
6790                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6791
6792                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6793         }
6794
6795         if (c->log_ratelimit_interval_usec > 0)
6796                 fprintf(f,
6797                         "%sLogRateLimitIntervalSec: %s\n",
6798                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6799
6800         if (c->log_ratelimit_burst > 0)
6801                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6802
6803         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6804                 fprintf(f, "%sLogFilterPatterns:", prefix);
6805
6806                 char *pattern;
6807                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6808                         fprintf(f, " %s", pattern);
6809                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6810                         fprintf(f, " ~%s", pattern);
6811                 fputc('\n', f);
6812         }
6813
6814         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6815                 fprintf(f, "%sLogExtraFields: ", prefix);
6816                 fwrite(c->log_extra_fields[j].iov_base,
6817                        1, c->log_extra_fields[j].iov_len,
6818                        f);
6819                 fputc('\n', f);
6820         }
6821
6822         if (c->log_namespace)
6823                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6824
6825         if (c->secure_bits) {
6826                 _cleanup_free_ char *str = NULL;
6827
6828                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6829                 if (r >= 0)
6830                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6831         }
6832
6833         if (c->capability_bounding_set != CAP_MASK_UNSET) {
6834                 _cleanup_free_ char *str = NULL;
6835
6836                 r = capability_set_to_string(c->capability_bounding_set, &str);
6837                 if (r >= 0)
6838                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6839         }
6840
6841         if (c->capability_ambient_set != 0) {
6842                 _cleanup_free_ char *str = NULL;
6843
6844                 r = capability_set_to_string(c->capability_ambient_set, &str);
6845                 if (r >= 0)
6846                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6847         }
6848
6849         if (c->user)
6850                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6851         if (c->group)
6852                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6853
6854         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6855
6856         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6857
6858         if (c->pam_name)
6859                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6860
6861         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6862         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6863         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6864         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6865         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6866         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6867
6868         for (size_t i = 0; i < c->n_bind_mounts; i++)
6869                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6870                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6871                         c->bind_mounts[i].ignore_enoent ? "-": "",
6872                         c->bind_mounts[i].source,
6873                         c->bind_mounts[i].destination,
6874                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6875
6876         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6877                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6878
6879                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6880                         t->path,
6881                         isempty(t->options) ? "" : ":",
6882                         strempty(t->options));
6883         }
6884
6885         if (c->utmp_id)
6886                 fprintf(f,
6887                         "%sUtmpIdentifier: %s\n",
6888                         prefix, c->utmp_id);
6889
6890         if (c->selinux_context)
6891                 fprintf(f,
6892                         "%sSELinuxContext: %s%s\n",
6893                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6894
6895         if (c->apparmor_profile)
6896                 fprintf(f,
6897                         "%sAppArmorProfile: %s%s\n",
6898                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6899
6900         if (c->smack_process_label)
6901                 fprintf(f,
6902                         "%sSmackProcessLabel: %s%s\n",
6903                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6904
6905         if (c->personality != PERSONALITY_INVALID)
6906                 fprintf(f,
6907                         "%sPersonality: %s\n",
6908                         prefix, strna(personality_to_string(c->personality)));
6909
6910         fprintf(f,
6911                 "%sLockPersonality: %s\n",
6912                 prefix, yes_no(c->lock_personality));
6913
6914         if (c->syscall_filter) {
6915                 fprintf(f,
6916                         "%sSystemCallFilter: ",
6917                         prefix);
6918
6919                 if (!c->syscall_allow_list)
6920                         fputc('~', f);
6921
6922 #if HAVE_SECCOMP
6923                 void *id, *val;
6924                 bool first = true;
6925                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6926                         _cleanup_free_ char *name = NULL;
6927                         const char *errno_name = NULL;
6928                         int num = PTR_TO_INT(val);
6929
6930                         if (first)
6931                                 first = false;
6932                         else
6933                                 fputc(' ', f);
6934
6935                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6936                         fputs(strna(name), f);
6937
6938                         if (num >= 0) {
6939                                 errno_name = seccomp_errno_or_action_to_string(num);
6940                                 if (errno_name)
6941                                         fprintf(f, ":%s", errno_name);
6942                                 else
6943                                         fprintf(f, ":%d", num);
6944                         }
6945                 }
6946 #endif
6947
6948                 fputc('\n', f);
6949         }
6950
6951         if (c->syscall_archs) {
6952                 fprintf(f,
6953                         "%sSystemCallArchitectures:",
6954                         prefix);
6955
6956 #if HAVE_SECCOMP
6957                 void *id;
6958                 SET_FOREACH(id, c->syscall_archs)
6959                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6960 #endif
6961                 fputc('\n', f);
6962         }
6963
6964         if (exec_context_restrict_namespaces_set(c)) {
6965                 _cleanup_free_ char *s = NULL;
6966
6967                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6968                 if (r >= 0)
6969                         fprintf(f, "%sRestrictNamespaces: %s\n",
6970                                 prefix, strna(s));
6971         }
6972
6973 #if HAVE_LIBBPF
6974         if (exec_context_restrict_filesystems_set(c)) {
6975                 char *fs;
6976                 SET_FOREACH(fs, c->restrict_filesystems)
6977                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6978         }
6979 #endif
6980
6981         if (c->network_namespace_path)
6982                 fprintf(f,
6983                         "%sNetworkNamespacePath: %s\n",
6984                         prefix, c->network_namespace_path);
6985
6986         if (c->syscall_errno > 0) {
6987                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6988
6989 #if HAVE_SECCOMP
6990                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6991                 if (errno_name)
6992                         fputs(errno_name, f);
6993                 else
6994                         fprintf(f, "%d", c->syscall_errno);
6995 #endif
6996                 fputc('\n', f);
6997         }
6998
6999         for (size_t i = 0; i < c->n_mount_images; i++) {
7000                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
7001                         c->mount_images[i].ignore_enoent ? "-": "",
7002                         c->mount_images[i].source,
7003                         c->mount_images[i].destination);
7004                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7005                         fprintf(f, ":%s:%s",
7006                                 partition_designator_to_string(o->partition_designator),
7007                                 strempty(o->options));
7008                 fprintf(f, "\n");
7009         }
7010
7011         for (size_t i = 0; i < c->n_extension_images; i++) {
7012                 fprintf(f, "%sExtensionImages: %s%s", prefix,
7013                         c->extension_images[i].ignore_enoent ? "-": "",
7014                         c->extension_images[i].source);
7015                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7016                         fprintf(f, ":%s:%s",
7017                                 partition_designator_to_string(o->partition_designator),
7018                                 strempty(o->options));
7019                 fprintf(f, "\n");
7020         }
7021
7022         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7023 }
7024
7025 bool exec_context_maintains_privileges(const ExecContext *c) {
7026         assert(c);
7027
7028         /* Returns true if the process forked off would run under
7029          * an unchanged UID or as root. */
7030
7031         if (!c->user)
7032                 return true;
7033
7034         if (streq(c->user, "root") || streq(c->user, "0"))
7035                 return true;
7036
7037         return false;
7038 }
7039
7040 int exec_context_get_effective_ioprio(const ExecContext *c) {
7041         int p;
7042
7043         assert(c);
7044
7045         if (c->ioprio_set)
7046                 return c->ioprio;
7047
7048         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7049         if (p < 0)
7050                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7051
7052         return ioprio_normalize(p);
7053 }
7054
7055 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7056         assert(c);
7057
7058         /* Explicit setting wins */
7059         if (c->mount_apivfs_set)
7060                 return c->mount_apivfs;
7061
7062         /* Default to "yes" if root directory or image are specified */
7063         if (exec_context_with_rootfs(c))
7064                 return true;
7065
7066         return false;
7067 }
7068
7069 void exec_context_free_log_extra_fields(ExecContext *c) {
7070         assert(c);
7071
7072         for (size_t l = 0; l < c->n_log_extra_fields; l++)
7073                 free(c->log_extra_fields[l].iov_base);
7074         c->log_extra_fields = mfree(c->log_extra_fields);
7075         c->n_log_extra_fields = 0;
7076 }
7077
7078 void exec_context_revert_tty(ExecContext *c) {
7079         _cleanup_close_ int fd = -EBADF;
7080         const char *path;
7081         struct stat st;
7082         int r;
7083
7084         assert(c);
7085
7086         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7087         exec_context_tty_reset(c, NULL);
7088
7089         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7090          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7091          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7092         if (!exec_context_may_touch_tty(c))
7093                 return;
7094
7095         path = exec_context_tty_path(c);
7096         if (!path)
7097                 return;
7098
7099         fd = open(path, O_PATH|O_CLOEXEC);
7100         if (fd < 0)
7101                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7102                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7103                                              path);
7104
7105         if (fstat(fd, &st) < 0)
7106                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7107
7108         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7109          * if things are a character device, since a proper check either means we'd have to open the TTY and
7110          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7111          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7112          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7113         if (!S_ISCHR(st.st_mode))
7114                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7115
7116         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7117         if (r < 0)
7118                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7119 }
7120
7121 int exec_context_get_clean_directories(
7122                 ExecContext *c,
7123                 char **prefix,
7124                 ExecCleanMask mask,
7125                 char ***ret) {
7126
7127         _cleanup_strv_free_ char **l = NULL;
7128         int r;
7129
7130         assert(c);
7131         assert(prefix);
7132         assert(ret);
7133
7134         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7135                 if (!FLAGS_SET(mask, 1U << t))
7136                         continue;
7137
7138                 if (!prefix[t])
7139                         continue;
7140
7141                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7142                         char *j;
7143
7144                         j = path_join(prefix[t], c->directories[t].items[i].path);
7145                         if (!j)
7146                                 return -ENOMEM;
7147
7148                         r = strv_consume(&l, j);
7149                         if (r < 0)
7150                                 return r;
7151
7152                         /* Also remove private directories unconditionally. */
7153                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
7154                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7155                                 if (!j)
7156                                         return -ENOMEM;
7157
7158                                 r = strv_consume(&l, j);
7159                                 if (r < 0)
7160                                         return r;
7161                         }
7162
7163                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7164                                 j = path_join(prefix[t], *symlink);
7165                                 if (!j)
7166                                         return -ENOMEM;
7167
7168                                 r = strv_consume(&l, j);
7169                                 if (r < 0)
7170                                         return r;
7171                         }
7172                 }
7173         }
7174
7175         *ret = TAKE_PTR(l);
7176         return 0;
7177 }
7178
7179 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7180         ExecCleanMask mask = 0;
7181
7182         assert(c);
7183         assert(ret);
7184
7185         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7186                 if (c->directories[t].n_items > 0)
7187                         mask |= 1U << t;
7188
7189         *ret = mask;
7190         return 0;
7191 }
7192
7193 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7194         ExecLoadCredential *load_cred;
7195         ExecSetCredential *set_cred;
7196
7197         assert(c);
7198
7199         HASHMAP_FOREACH(load_cred, c->load_credentials)
7200                 if (load_cred->encrypted)
7201                         return true;
7202
7203         HASHMAP_FOREACH(set_cred, c->set_credentials)
7204                 if (set_cred->encrypted)
7205                         return true;
7206
7207         return false;
7208 }
7209
7210 void exec_status_start(ExecStatus *s, pid_t pid) {
7211         assert(s);
7212
7213         *s = (ExecStatus) {
7214                 .pid = pid,
7215         };
7216
7217         dual_timestamp_get(&s->start_timestamp);
7218 }
7219
7220 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7221         assert(s);
7222
7223         if (s->pid != pid)
7224                 *s = (ExecStatus) {
7225                         .pid = pid,
7226                 };
7227
7228         dual_timestamp_get(&s->exit_timestamp);
7229
7230         s->code = code;
7231         s->status = status;
7232
7233         if (context && context->utmp_id)
7234                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7235 }
7236
7237 void exec_status_reset(ExecStatus *s) {
7238         assert(s);
7239
7240         *s = (ExecStatus) {};
7241 }
7242
7243 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7244         assert(s);
7245         assert(f);
7246
7247         if (s->pid <= 0)
7248                 return;
7249
7250         prefix = strempty(prefix);
7251
7252         fprintf(f,
7253                 "%sPID: "PID_FMT"\n",
7254                 prefix, s->pid);
7255
7256         if (dual_timestamp_is_set(&s->start_timestamp))
7257                 fprintf(f,
7258                         "%sStart Timestamp: %s\n",
7259                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7260
7261         if (dual_timestamp_is_set(&s->exit_timestamp))
7262                 fprintf(f,
7263                         "%sExit Timestamp: %s\n"
7264                         "%sExit Code: %s\n"
7265                         "%sExit Status: %i\n",
7266                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7267                         prefix, sigchld_code_to_string(s->code),
7268                         prefix, s->status);
7269 }
7270
7271 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7272         _cleanup_free_ char *cmd = NULL;
7273         const char *prefix2;
7274
7275         assert(c);
7276         assert(f);
7277
7278         prefix = strempty(prefix);
7279         prefix2 = strjoina(prefix, "\t");
7280
7281         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7282
7283         fprintf(f,
7284                 "%sCommand Line: %s\n",
7285                 prefix, strnull(cmd));
7286
7287         exec_status_dump(&c->exec_status, f, prefix2);
7288 }
7289
7290 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7291         assert(f);
7292
7293         prefix = strempty(prefix);
7294
7295         LIST_FOREACH(command, i, c)
7296                 exec_command_dump(i, f, prefix);
7297 }
7298
7299 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7300         ExecCommand *end;
7301
7302         assert(l);
7303         assert(e);
7304
7305         if (*l) {
7306                 /* It's kind of important, that we keep the order here */
7307                 end = LIST_FIND_TAIL(command, *l);
7308                 LIST_INSERT_AFTER(command, *l, end, e);
7309         } else
7310                 *l = e;
7311 }
7312
7313 int exec_command_set(ExecCommand *c, const char *path, ...) {
7314         va_list ap;
7315         char **l, *p;
7316
7317         assert(c);
7318         assert(path);
7319
7320         va_start(ap, path);
7321         l = strv_new_ap(path, ap);
7322         va_end(ap);
7323
7324         if (!l)
7325                 return -ENOMEM;
7326
7327         p = strdup(path);
7328         if (!p) {
7329                 strv_free(l);
7330                 return -ENOMEM;
7331         }
7332
7333         free_and_replace(c->path, p);
7334
7335         return strv_free_and_replace(c->argv, l);
7336 }
7337
7338 int exec_command_append(ExecCommand *c, const char *path, ...) {
7339         _cleanup_strv_free_ char **l = NULL;
7340         va_list ap;
7341         int r;
7342
7343         assert(c);
7344         assert(path);
7345
7346         va_start(ap, path);
7347         l = strv_new_ap(path, ap);
7348         va_end(ap);
7349
7350         if (!l)
7351                 return -ENOMEM;
7352
7353         r = strv_extend_strv(&c->argv, l, false);
7354         if (r < 0)
7355                 return r;
7356
7357         return 0;
7358 }
7359
7360 static char *destroy_tree(char *path) {
7361         if (!path)
7362                 return NULL;
7363
7364         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7365                 log_debug("Spawning process to nuke '%s'", path);
7366
7367                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7368         }
7369
7370         return mfree(path);
7371 }
7372
7373 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7374         if (!rt)
7375                 return NULL;
7376
7377         if (rt->manager)
7378                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7379
7380         rt->id = mfree(rt->id);
7381         rt->tmp_dir = mfree(rt->tmp_dir);
7382         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7383         safe_close_pair(rt->netns_storage_socket);
7384         safe_close_pair(rt->ipcns_storage_socket);
7385         return mfree(rt);
7386 }
7387
/* Generate the reference-dropping helper exec_shared_runtime_unref() (called from exec_runtime_free()) and
 * the exec_shared_runtime_freep() handler used with _cleanup_() in this file. Both are built on
 * exec_shared_runtime_free(), i.e. neither removes the temporary directories from disk. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7390
7391 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7392         if (!rt)
7393                 return NULL;
7394
7395         assert(rt->n_ref > 0);
7396         rt->n_ref--;
7397
7398         if (rt->n_ref > 0)
7399                 return NULL;
7400
7401         rt->tmp_dir = destroy_tree(rt->tmp_dir);
7402         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7403
7404         return exec_shared_runtime_free(rt);
7405 }
7406
7407 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7408         _cleanup_free_ char *id_copy = NULL;
7409         ExecSharedRuntime *n;
7410
7411         assert(ret);
7412
7413         id_copy = strdup(id);
7414         if (!id_copy)
7415                 return -ENOMEM;
7416
7417         n = new(ExecSharedRuntime, 1);
7418         if (!n)
7419                 return -ENOMEM;
7420
7421         *n = (ExecSharedRuntime) {
7422                 .id = TAKE_PTR(id_copy),
7423                 .netns_storage_socket = PIPE_EBADF,
7424                 .ipcns_storage_socket = PIPE_EBADF,
7425         };
7426
7427         *ret = n;
7428         return 0;
7429 }
7430
/* Allocates a new ExecSharedRuntime for 'id', registers it in m->exec_shared_runtime_by_id and moves the
 * passed resources into it.
 *
 * tmp_dir/var_tmp_dir must be non-NULL pointers (they are dereferenced unconditionally); the two socket
 * arrays may be NULL. On success (return value 0) the strings and fds are taken over with
 * TAKE_PTR()/TAKE_FD(), and the new object is stored in *ret if 'ret' is non-NULL. The reference counter
 * of the new object is not modified here. On failure (negative errno-style value) nothing has been taken
 * from the caller. */
static int exec_shared_runtime_add(
                Manager *m,
                const char *id,
                char **tmp_dir,
                char **var_tmp_dir,
                int netns_storage_socket[2],
                int ipcns_storage_socket[2],
                ExecSharedRuntime **ret) {

        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
        int r;

        assert(m);
        assert(id);

        /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */

        r = exec_shared_runtime_allocate(&rt, id);
        if (r < 0)
                return r;

        /* Register before taking over any resource: all failure exits are above the donation below, and
         * at that point rt->manager is still unset, so the cleanup handler leaves the hashmap alone. */
        r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
        if (r < 0)
                return r;

        /* NOTE(review): this checks the fields of the freshly allocated object (which
         * exec_shared_runtime_allocate() leaves unset), not the values passed in via *tmp_dir and
         * *var_tmp_dir — confirm which of the two was intended. */
        assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
        rt->tmp_dir = TAKE_PTR(*tmp_dir);
        rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);

        if (netns_storage_socket) {
                rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
                rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
        }

        if (ipcns_storage_socket) {
                rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
                rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
        }

        /* From here on exec_shared_runtime_free() will also drop the hashmap entry. */
        rt->manager = m;

        if (ret)
                *ret = rt;
        /* do not remove created ExecSharedRuntime object when the operation succeeds. */
        TAKE_PTR(rt);
        return 0;
}
7478
/* Creates the shared runtime resources the context asks for (private temporary directories, network
 * namespace storage socket pair, IPC namespace storage socket pair) and registers them under 'id' via
 * exec_shared_runtime_add().
 *
 * Returns 1 and stores the new object in *ret when one was created, 0 with *ret set to NULL when the
 * context needs none of these resources, and a negative errno-style value on failure. Resources that were
 * created but not handed over are released by the cleanup handlers on the local variables. */
static int exec_shared_runtime_make(
                Manager *m,
                const ExecContext *c,
                const char *id,
                ExecSharedRuntime **ret) {

        _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
        _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
        int r;

        assert(m);
        assert(c);
        assert(id);

        /* It is not necessary to create ExecSharedRuntime object. */
        if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
                *ret = NULL;
                return 0;
        }

        /* Temporary directories are set up unless both /tmp and /var/tmp (or all of /var) are listed in
         * the context's inaccessible paths. */
        if (c->private_tmp &&
            !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
              (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
               prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
                r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
                if (r < 0)
                        return r;
        }

        if (exec_needs_network_namespace(c)) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
                        return -errno;
        }

        if (exec_needs_ipc_namespace(c)) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
                        return -errno;
        }

        /* On success this takes over the directories and sockets. */
        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
        if (r < 0)
                return r;

        return 1;
}
7524
7525 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7526         ExecSharedRuntime *rt;
7527         int r;
7528
7529         assert(m);
7530         assert(id);
7531         assert(ret);
7532
7533         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7534         if (rt)
7535                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7536                 goto ref;
7537
7538         if (!create) {
7539                 *ret = NULL;
7540                 return 0;
7541         }
7542
7543         /* If not found, then create a new object. */
7544         r = exec_shared_runtime_make(m, c, id, &rt);
7545         if (r < 0)
7546                 return r;
7547         if (r == 0) {
7548                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7549                 *ret = NULL;
7550                 return 0;
7551         }
7552
7553 ref:
7554         /* increment reference counter. */
7555         rt->n_ref++;
7556         *ret = rt;
7557         return 1;
7558 }
7559
/* Writes one "exec-runtime=" line to 'f' for every ExecSharedRuntime object known to the manager. Each
 * line carries the ID followed by the optional, space-separated fields "tmp-dir=", "var-tmp-dir=",
 * "netns-socket-0=", "netns-socket-1=", "ipcns-socket-0=" and "ipcns-socket-1=", in this order. For each
 * valid socket a duplicate fd is stored in 'fds' and the number of that duplicate is what gets written.
 *
 * Returns 0 on success. If duplicating an fd fails, its negative error is returned immediately, i.e. the
 * line being written at that point is left without its terminating newline. */
int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
        ExecSharedRuntime *rt;

        assert(m);
        assert(f);
        assert(fds);

        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                fprintf(f, "exec-runtime=%s", rt->id);

                if (rt->tmp_dir)
                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);

                if (rt->var_tmp_dir)
                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);

                if (rt->netns_storage_socket[0] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " netns-socket-0=%i", copy);
                }

                if (rt->netns_storage_socket[1] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " netns-socket-1=%i", copy);
                }

                if (rt->ipcns_storage_socket[0] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " ipcns-socket-0=%i", copy);
                }

                if (rt->ipcns_storage_socket[1] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " ipcns-socket-1=%i", copy);
                }

                fputc('\n', f);
        }

        return 0;
}
7621
/* Applies a single legacy per-unit key/value pair ("tmp-dir", "var-tmp-dir", "netns-socket-0",
 * "netns-socket-1") to the ExecSharedRuntime object registered under the unit's ID, creating and
 * registering that object first if it does not exist yet.
 *
 * Returns 1 if the pair was applied, 0 if it was ignored (unit without ID, unknown key, unusable fd
 * value, or failure to register a newly created object) and a negative errno-style value on allocation
 * failure. A newly created object that ends up unused or unregistered is freed again by the cleanup
 * handler on 'rt_create'. */
int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
        ExecSharedRuntime *rt;
        int r;

        /* This is for the migration from old (v237 or earlier) deserialization text.
         * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
         * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
         * so or not from the serialized text, then we always creates a new object owned by this. */

        assert(u);
        assert(key);
        assert(value);

        /* Manager manages ExecSharedRuntime objects by the unit id.
         * So, we omit the serialized text when the unit does not have id (yet?)... */
        if (isempty(u->id)) {
                /* NOTE(review): the condition tests the unit ID, while the message talks about the
                 * invocation ID — confirm the wording. */
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
                return log_oom();

        rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
        if (!rt) {
                /* Work on a temporary object; it is only registered once the key was applied. */
                if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
                        return log_oom();

                rt = rt_create;
        }

        if (streq(key, "tmp-dir")) {
                if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "var-tmp-dir")) {
                if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "netns-socket-0")) {
                int fd;

                if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                /* Replace a previously stored socket, if any, by the one taken out of the fd set. */
                safe_close(rt->netns_storage_socket[0]);
                rt->netns_storage_socket[0] = fdset_remove(fds, fd);

        } else if (streq(key, "netns-socket-1")) {
                int fd;

                if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                safe_close(rt->netns_storage_socket[1]);
                rt->netns_storage_socket[1] = fdset_remove(fds, fd);

        } else
                return 0;

        /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
        if (rt_create) {
                r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
                if (r < 0) {
                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                        return 0;
                }

                rt_create->manager = u->manager;

                /* Avoid cleanup */
                TAKE_PTR(rt_create);
        }

        return 1;
}
7703
7704 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7705         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7706         char *id = NULL;
7707         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7708         const char *p, *v = ASSERT_PTR(value);
7709         size_t n;
7710
7711         assert(m);
7712         assert(fds);
7713
7714         n = strcspn(v, " ");
7715         id = strndupa_safe(v, n);
7716         if (v[n] != ' ')
7717                 goto finalize;
7718         p = v + n + 1;
7719
7720         v = startswith(p, "tmp-dir=");
7721         if (v) {
7722                 n = strcspn(v, " ");
7723                 tmp_dir = strndup(v, n);
7724                 if (!tmp_dir)
7725                         return log_oom();
7726                 if (v[n] != ' ')
7727                         goto finalize;
7728                 p = v + n + 1;
7729         }
7730
7731         v = startswith(p, "var-tmp-dir=");
7732         if (v) {
7733                 n = strcspn(v, " ");
7734                 var_tmp_dir = strndup(v, n);
7735                 if (!var_tmp_dir)
7736                         return log_oom();
7737                 if (v[n] != ' ')
7738                         goto finalize;
7739                 p = v + n + 1;
7740         }
7741
7742         v = startswith(p, "netns-socket-0=");
7743         if (v) {
7744                 char *buf;
7745
7746                 n = strcspn(v, " ");
7747                 buf = strndupa_safe(v, n);
7748
7749                 netns_fdpair[0] = parse_fd(buf);
7750                 if (netns_fdpair[0] < 0)
7751                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7752                 if (!fdset_contains(fds, netns_fdpair[0]))
7753                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7754                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7755                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7756                 if (v[n] != ' ')
7757                         goto finalize;
7758                 p = v + n + 1;
7759         }
7760
7761         v = startswith(p, "netns-socket-1=");
7762         if (v) {
7763                 char *buf;
7764
7765                 n = strcspn(v, " ");
7766                 buf = strndupa_safe(v, n);
7767
7768                 netns_fdpair[1] = parse_fd(buf);
7769                 if (netns_fdpair[1] < 0)
7770                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7771                 if (!fdset_contains(fds, netns_fdpair[1]))
7772                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7773                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7774                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7775                 if (v[n] != ' ')
7776                         goto finalize;
7777                 p = v + n + 1;
7778         }
7779
7780         v = startswith(p, "ipcns-socket-0=");
7781         if (v) {
7782                 char *buf;
7783
7784                 n = strcspn(v, " ");
7785                 buf = strndupa_safe(v, n);
7786
7787                 ipcns_fdpair[0] = parse_fd(buf);
7788                 if (ipcns_fdpair[0] < 0)
7789                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7790                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7791                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7792                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7793                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7794                 if (v[n] != ' ')
7795                         goto finalize;
7796                 p = v + n + 1;
7797         }
7798
7799         v = startswith(p, "ipcns-socket-1=");
7800         if (v) {
7801                 char *buf;
7802
7803                 n = strcspn(v, " ");
7804                 buf = strndupa_safe(v, n);
7805
7806                 ipcns_fdpair[1] = parse_fd(buf);
7807                 if (ipcns_fdpair[1] < 0)
7808                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7809                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7810                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7811                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7812                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7813         }
7814
7815 finalize:
7816         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7817         if (r < 0)
7818                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7819         return 0;
7820 }
7821
7822 void exec_shared_runtime_vacuum(Manager *m) {
7823         ExecSharedRuntime *rt;
7824
7825         assert(m);
7826
7827         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7828
7829         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7830                 if (rt->n_ref > 0)
7831                         continue;
7832
7833                 (void) exec_shared_runtime_free(rt);
7834         }
7835 }
7836
/* Creates an ExecRuntime object that bundles the given shared runtime and dynamic credentials and, if the
 * context needs an ephemeral copy, a randomly named path below /var/lib/systemd/ephemeral-trees plus a
 * storage socket pair for it.
 *
 * Returns 1 with the new object in *ret, 0 with *ret set to NULL if there is nothing to bundle, and a
 * negative errno-style value on failure. The 'shared' and 'creds' pointers are stored in the object as
 * they are; no additional reference is taken here. */
int exec_runtime_make(
                const Unit *unit,
                const ExecContext *context,
                ExecSharedRuntime *shared,
                DynamicCreds *creds,
                ExecRuntime **ret) {
        _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
        _cleanup_free_ char *ephemeral = NULL;
        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
        int r;

        assert(unit);
        assert(context);
        assert(ret);

        if (!shared && !creds && !exec_needs_ephemeral(context)) {
                *ret = NULL;
                return 0;
        }

        if (exec_needs_ephemeral(context)) {
                r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
                if (r < 0)
                        return r;

                /* Only the path name is generated here, nothing is created at that path by this call. */
                r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
                if (r < 0)
                        return r;

                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
                        return -errno;
        }

        rt = new(ExecRuntime, 1);
        if (!rt)
                return -ENOMEM;

        /* Move the path and the sockets into the object, so the cleanup handlers leave them alone. */
        *rt = (ExecRuntime) {
                .shared = shared,
                .dynamic_creds = creds,
                .ephemeral_copy = TAKE_PTR(ephemeral),
                .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
                .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
        };

        *ret = TAKE_PTR(rt);
        return 1;
}
7885
7886 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7887         if (!rt)
7888                 return NULL;
7889
7890         exec_shared_runtime_unref(rt->shared);
7891         dynamic_creds_unref(rt->dynamic_creds);
7892
7893         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7894
7895         safe_close_pair(rt->ephemeral_storage_socket);
7896         return mfree(rt);
7897 }
7898
7899 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7900         if (!rt)
7901                 return NULL;
7902
7903         rt->shared = exec_shared_runtime_destroy(rt->shared);
7904         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7905         return exec_runtime_free(rt);
7906 }
7907
7908 void exec_params_clear(ExecParameters *p) {
7909         if (!p)
7910                 return;
7911
7912         p->environment = strv_free(p->environment);
7913         p->fd_names = strv_free(p->fd_names);
7914         p->fds = mfree(p->fds);
7915         p->exec_fd = safe_close(p->exec_fd);
7916 }
7917
7918 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7919         if (!sc)
7920                 return NULL;
7921
7922         free(sc->id);
7923         free(sc->data);
7924         return mfree(sc);
7925 }
7926
7927 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7928         if (!lc)
7929                 return NULL;
7930
7931         free(lc->id);
7932         free(lc->path);
7933         return mfree(lc);
7934 }
7935
7936 void exec_directory_done(ExecDirectory *d) {
7937         if (!d)
7938                 return;
7939
7940         for (size_t i = 0; i < d->n_items; i++) {
7941                 free(d->items[i].path);
7942                 strv_free(d->items[i].symlinks);
7943         }
7944
7945         d->items = mfree(d->items);
7946         d->n_items = 0;
7947         d->mode = 0755;
7948 }
7949
7950 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7951         assert(d);
7952         assert(path);
7953
7954         for (size_t i = 0; i < d->n_items; i++)
7955                 if (path_equal(d->items[i].path, path))
7956                         return &d->items[i];
7957
7958         return NULL;
7959 }
7960
7961 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7962         _cleanup_strv_free_ char **s = NULL;
7963         _cleanup_free_ char *p = NULL;
7964         ExecDirectoryItem *existing;
7965         int r;
7966
7967         assert(d);
7968         assert(path);
7969
7970         existing = exec_directory_find(d, path);
7971         if (existing) {
7972                 r = strv_extend(&existing->symlinks, symlink);
7973                 if (r < 0)
7974                         return r;
7975
7976                 return 0; /* existing item is updated */
7977         }
7978
7979         p = strdup(path);
7980         if (!p)
7981                 return -ENOMEM;
7982
7983         if (symlink) {
7984                 s = strv_new(symlink);
7985                 if (!s)
7986                         return -ENOMEM;
7987         }
7988
7989         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7990                 return -ENOMEM;
7991
7992         d->items[d->n_items++] = (ExecDirectoryItem) {
7993                 .path = TAKE_PTR(p),
7994                 .symlinks = TAKE_PTR(s),
7995         };
7996
7997         return 1; /* new item is added */
7998 }
7999
8000 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
8001         assert(a);
8002         assert(b);
8003
8004         return path_compare(a->path, b->path);
8005 }
8006
8007 void exec_directory_sort(ExecDirectory *d) {
8008         assert(d);
8009
8010         /* Sort the exec directories to make always parent directories processed at first in
8011          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8012          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8013          * list. See also comments in setup_exec_directory() and issue #24783. */
8014
8015         if (d->n_items <= 1)
8016                 return;
8017
8018         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8019
8020         for (size_t i = 1; i < d->n_items; i++)
8021                 for (size_t j = 0; j < i; j++)
8022                         if (path_startswith(d->items[i].path, d->items[j].path)) {
8023                                 d->items[i].only_create = true;
8024                                 break;
8025                         }
8026 }
8027
8028 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8029         ExecDirectoryType t;
8030
8031         assert(s);
8032
8033         if (streq(s, "all"))
8034                 return EXEC_CLEAN_ALL;
8035         if (streq(s, "fdstore"))
8036                 return EXEC_CLEAN_FDSTORE;
8037
8038         t = exec_resource_type_from_string(s);
8039         if (t < 0)
8040                 return (ExecCleanMask) t;
8041
8042         return 1U << t;
8043 }
8044
/* Hash operations for the credential hashmaps: string keys, with exec_set_credential_free() and
 * exec_load_credential_free() respectively registered as value destructors. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8047
/* String names of the ExecInput values, indexed by enum value. The macro below generates the string
 * lookup helpers for this table (presumably exec_input_to_string()/exec_input_from_string() — see the
 * macro definition). */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8060
/* String names of the ExecOutput values, indexed by enum value. The macro below generates the string
 * lookup helpers for this table (presumably exec_output_to_string()/exec_output_from_string() — see the
 * macro definition). */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8077
8078 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8079         [EXEC_UTMP_INIT] = "init",
8080         [EXEC_UTMP_LOGIN] = "login",
8081         [EXEC_UTMP_USER] = "user",
8082 };
8083
8084 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8085
8086 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8087         [EXEC_PRESERVE_NO] = "no",
8088         [EXEC_PRESERVE_YES] = "yes",
8089         [EXEC_PRESERVE_RESTART] = "restart",
8090 };
8091
8092 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8093
8094 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8095 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8096         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8097         [EXEC_DIRECTORY_STATE] = "StateDirectory",
8098         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8099         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8100         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8101 };
8102
8103 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8104
8105 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8106 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8107         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
8108         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
8109         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
8110         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
8111         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
8112 };
8113
8114 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8115
8116 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8117  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8118  * directories, specifically .timer units with their timestamp touch file. */
8119 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8120         [EXEC_DIRECTORY_RUNTIME] = "runtime",
8121         [EXEC_DIRECTORY_STATE] = "state",
8122         [EXEC_DIRECTORY_CACHE] = "cache",
8123         [EXEC_DIRECTORY_LOGS] = "logs",
8124         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8125 };
8126
8127 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8128
8129 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8130  * the service payload in. */
8131 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8132         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8133         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8134         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8135         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8136         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8137 };
8138
8139 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8140
8141 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8142         [EXEC_KEYRING_INHERIT] = "inherit",
8143         [EXEC_KEYRING_PRIVATE] = "private",
8144         [EXEC_KEYRING_SHARED] = "shared",
8145 };
8146
8147 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);