src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/file.h>
   8 #include <sys/ioctl.h>
   9 #include <sys/mman.h>
  10 #include <sys/mount.h>
  11 #include <sys/personality.h>
  12 #include <sys/prctl.h>
  13 #include <sys/shm.h>
  14 #include <sys/types.h>
  15 #include <sys/un.h>
  16 #include <unistd.h>
  17 #include <utmpx.h>
  18
  19 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  20
  21 #if HAVE_PAM
  22 #include <security/pam_appl.h>
  23 #endif
  24
  25 #if HAVE_SELINUX
  26 #include <selinux/selinux.h>
  27 #endif
  28
  29 #if HAVE_SECCOMP
  30 #include <seccomp.h>
  31 #endif
  32
  33 #if HAVE_APPARMOR
  34 #include <sys/apparmor.h>
  35 #endif
  36
  37 #include "sd-messages.h"
  38
  39 #include "acl-util.h"
  40 #include "af-list.h"
  41 #include "alloc-util.h"
  42 #if HAVE_APPARMOR
  43 #include "apparmor-util.h"
  44 #endif
  45 #include "argv-util.h"
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "bpf-lsm.h"
  49 #include "btrfs-util.h"
  50 #include "cap-list.h"
  51 #include "capability-util.h"
  52 #include "chattr-util.h"
  53 #include "cgroup-setup.h"
  54 #include "chase.h"
  55 #include "chown-recursive.h"
  56 #include "constants.h"
  57 #include "cpu-set-util.h"
  58 #include "creds-util.h"
  59 #include "data-fd-util.h"
  60 #include "env-file.h"
  61 #include "env-util.h"
  62 #include "errno-list.h"
  63 #include "escape.h"
  64 #include "execute.h"
  65 #include "exit-status.h"
  66 #include "fd-util.h"
  67 #include "fileio.h"
  68 #include "format-util.h"
  69 #include "glob-util.h"
  70 #include "hexdecoct.h"
  71 #include "io-util.h"
  72 #include "ioprio-util.h"
  73 #include "label-util.h"
  74 #include "lock-util.h"
  75 #include "log.h"
  76 #include "macro.h"
  77 #include "manager.h"
  78 #include "manager-dump.h"
  79 #include "memory-util.h"
  80 #include "missing_fs.h"
  81 #include "missing_ioprio.h"
  82 #include "missing_prctl.h"
  83 #include "mkdir-label.h"
  84 #include "mount-util.h"
  85 #include "mountpoint-util.h"
  86 #include "namespace.h"
  87 #include "parse-util.h"
  88 #include "path-util.h"
  89 #include "proc-cmdline.h"
  90 #include "process-util.h"
  91 #include "psi-util.h"
  92 #include "random-util.h"
  93 #include "recurse-dir.h"
  94 #include "rlimit-util.h"
  95 #include "rm-rf.h"
  96 #if HAVE_SECCOMP
  97 #include "seccomp-util.h"
  98 #endif
  99 #include "securebits-util.h"
 100 #include "selinux-util.h"
 101 #include "signal-util.h"
 102 #include "smack-util.h"
 103 #include "socket-util.h"
 104 #include "sort-util.h"
 105 #include "special.h"
 106 #include "stat-util.h"
 107 #include "string-table.h"
 108 #include "string-util.h"
 109 #include "strv.h"
 110 #include "syslog-util.h"
 111 #include "terminal-util.h"
 112 #include "tmpfile-util.h"
 113 #include "umask-util.h"
 114 #include "unit-serialize.h"
 115 #include "user-util.h"
 116 #include "utmp-wtmp.h"
 117
 118 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 119 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 120
 121 #define SNDBUF_SIZE (8*1024*1024)
 122
 123 static int shift_fds(int fds[], size_t n_fds) {
 124         if (n_fds <= 0)
 125                 return 0;
 126
 127         /* Modifies the fds array! (sorts it) */
 128
 129         assert(fds);
 130
 131         for (int start = 0;;) {
 132                 int restart_from = -1;
 133
 134                 for (int i = start; i < (int) n_fds; i++) {
 135                         int nfd;
 136
 137                         /* Already at right index? */
 138                         if (fds[i] == i+3)
 139                                 continue;
 140
 141                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 142                         if (nfd < 0)
 143                                 return -errno;
 144
 145                         safe_close(fds[i]);
 146                         fds[i] = nfd;
 147
 148                         /* Hmm, the fd we wanted isn't free? Then
 149                          * let's remember that and try again from here */
 150                         if (nfd != i+3 && restart_from < 0)
 151                                 restart_from = i;
 152                 }
 153
 154                 if (restart_from < 0)
 155                         break;
 156
 157                 start = restart_from;
 158         }
 159
 160         return 0;
 161 }
 162
 163 static int flags_fds(
 164                 const int fds[],
 165                 size_t n_socket_fds,
 166                 size_t n_fds,
 167                 bool nonblock) {
 168
 169         int r;
 170
 171         if (n_fds <= 0)
 172                 return 0;
 173
 174         assert(fds);
 175
 176         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 177          * O_NONBLOCK only applies to socket activation though. */
 178
 179         for (size_t i = 0; i < n_fds; i++) {
 180
 181                 if (i < n_socket_fds) {
 182                         r = fd_nonblock(fds[i], nonblock);
 183                         if (r < 0)
 184                                 return r;
 185                 }
 186
 187                 /* We unconditionally drop FD_CLOEXEC from the fds,
 188                  * since after all we want to pass these fds to our
 189                  * children */
 190
 191                 r = fd_cloexec(fds[i], false);
 192                 if (r < 0)
 193                         return r;
 194         }
 195
 196         return 0;
 197 }
 198
 199 static const char *exec_context_tty_path(const ExecContext *context) {
 200         assert(context);
 201
 202         if (context->stdio_as_fds)
 203                 return NULL;
 204
 205         if (context->tty_path)
 206                 return context->tty_path;
 207
 208         return "/dev/console";
 209 }
 210
 211 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 212         unsigned rows, cols;
 213         const char *tty;
 214
 215         assert(context);
 216         assert(ret_rows);
 217         assert(ret_cols);
 218
 219         rows = context->tty_rows;
 220         cols = context->tty_cols;
 221
 222         tty = exec_context_tty_path(context);
 223         if (tty)
 224                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 225
 226         *ret_rows = rows;
 227         *ret_cols = cols;
 228
 229         return 0;
 230 }
 231
 232 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 233         _cleanup_close_ int fd = -EBADF;
 234         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 235
 236         /* Take a lock around the device for the duration of the setup that we do here.
 237          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 238          * We open a new fd that will be closed automatically, and operate on it for convenience.
 239          */
 240
 241         if (p && p->stdin_fd >= 0) {
 242                 fd = xopenat_lock(p->stdin_fd, NULL,
 243                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 244                 if (fd < 0)
 245                         return;
 246         } else if (path) {
 247                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 248                 if (fd < 0)
 249                         return;
 250
 251                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 252                         return;
 253         } else
 254                 return;   /* nothing to do */
 255
 256         if (context->tty_vhangup)
 257                 (void) terminal_vhangup_fd(fd);
 258
 259         if (context->tty_reset)
 260                 (void) reset_terminal_fd(fd, true);
 261
 262         if (p && p->stdin_fd >= 0) {
 263                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 264
 265                 (void) exec_context_tty_size(context, &rows, &cols);
 266                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 267         }
 268
 269         if (context->tty_vt_disallocate && path)
 270                 (void) vt_disallocate(path);
 271 }
 272
 273 static bool is_terminal_input(ExecInput i) {
 274         return IN_SET(i,
 275                       EXEC_INPUT_TTY,
 276                       EXEC_INPUT_TTY_FORCE,
 277                       EXEC_INPUT_TTY_FAIL);
 278 }
 279
 280 static bool is_terminal_output(ExecOutput o) {
 281         return IN_SET(o,
 282                       EXEC_OUTPUT_TTY,
 283                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 284                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 285 }
 286
 287 static bool is_kmsg_output(ExecOutput o) {
 288         return IN_SET(o,
 289                       EXEC_OUTPUT_KMSG,
 290                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 291 }
 292
 293 static bool exec_context_needs_term(const ExecContext *c) {
 294         assert(c);
 295
 296         /* Return true if the execution context suggests we should set $TERM to something useful. */
 297
 298         if (is_terminal_input(c->std_input))
 299                 return true;
 300
 301         if (is_terminal_output(c->std_output))
 302                 return true;
 303
 304         if (is_terminal_output(c->std_error))
 305                 return true;
 306
 307         return !!c->tty_path;
 308 }
 309
 310 static int open_null_as(int flags, int nfd) {
 311         int fd;
 312
 313         assert(nfd >= 0);
 314
 315         fd = open("/dev/null", flags|O_NOCTTY);
 316         if (fd < 0)
 317                 return -errno;
 318
 319         return move_fd(fd, nfd, false);
 320 }
 321
 322 static int connect_journal_socket(
 323                 int fd,
 324                 const char *log_namespace,
 325                 uid_t uid,
 326                 gid_t gid) {
 327
 328         uid_t olduid = UID_INVALID;
 329         gid_t oldgid = GID_INVALID;
 330         const char *j;
 331         int r;
 332
 333         j = log_namespace ?
 334                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 335                 "/run/systemd/journal/stdout";
 336
 337         if (gid_is_valid(gid)) {
 338                 oldgid = getgid();
 339
 340                 if (setegid(gid) < 0)
 341                         return -errno;
 342         }
 343
 344         if (uid_is_valid(uid)) {
 345                 olduid = getuid();
 346
 347                 if (seteuid(uid) < 0) {
 348                         r = -errno;
 349                         goto restore_gid;
 350                 }
 351         }
 352
 353         r = connect_unix_path(fd, AT_FDCWD, j);
 354
 355         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 356            an LSM interferes. */
 357
 358         if (uid_is_valid(uid))
 359                 (void) seteuid(olduid);
 360
 361  restore_gid:
 362         if (gid_is_valid(gid))
 363                 (void) setegid(oldgid);
 364
 365         return r;
 366 }
 367
 368 static int connect_logger_as(
 369                 const Unit *unit,
 370                 const ExecContext *context,
 371                 const ExecParameters *params,
 372                 ExecOutput output,
 373                 const char *ident,
 374                 int nfd,
 375                 uid_t uid,
 376                 gid_t gid) {
 377
 378         _cleanup_close_ int fd = -EBADF;
 379         int r;
 380
 381         assert(context);
 382         assert(params);
 383         assert(output < _EXEC_OUTPUT_MAX);
 384         assert(ident);
 385         assert(nfd >= 0);
 386
 387         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 388         if (fd < 0)
 389                 return -errno;
 390
 391         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 392         if (r < 0)
 393                 return r;
 394
 395         if (shutdown(fd, SHUT_RD) < 0)
 396                 return -errno;
 397
 398         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 399
 400         if (dprintf(fd,
 401                 "%s\n"
 402                 "%s\n"
 403                 "%i\n"
 404                 "%i\n"
 405                 "%i\n"
 406                 "%i\n"
 407                 "%i\n",
 408                 context->syslog_identifier ?: ident,
 409                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 410                 context->syslog_priority,
 411                 !!context->syslog_level_prefix,
 412                 false,
 413                 is_kmsg_output(output),
 414                 is_terminal_output(output)) < 0)
 415                 return -errno;
 416
 417         return move_fd(TAKE_FD(fd), nfd, false);
 418 }
 419
 420 static int open_terminal_as(const char *path, int flags, int nfd) {
 421         int fd;
 422
 423         assert(path);
 424         assert(nfd >= 0);
 425
 426         fd = open_terminal(path, flags | O_NOCTTY);
 427         if (fd < 0)
 428                 return fd;
 429
 430         return move_fd(fd, nfd, false);
 431 }
 432
 433 static int acquire_path(const char *path, int flags, mode_t mode) {
 434         _cleanup_close_ int fd = -EBADF;
 435         int r;
 436
 437         assert(path);
 438
 439         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 440                 flags |= O_CREAT;
 441
 442         fd = open(path, flags|O_NOCTTY, mode);
 443         if (fd >= 0)
 444                 return TAKE_FD(fd);
 445
 446         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 447                 return -errno;
 448
 449         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 450
 451         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 452         if (fd < 0)
 453                 return -errno;
 454
 455         r = connect_unix_path(fd, AT_FDCWD, path);
 456         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 457                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 458                  * wasn't an AF_UNIX socket after all */
 459                 return -ENXIO;
 460         if (r < 0)
 461                 return r;
 462
 463         if ((flags & O_ACCMODE) == O_RDONLY)
 464                 r = shutdown(fd, SHUT_WR);
 465         else if ((flags & O_ACCMODE) == O_WRONLY)
 466                 r = shutdown(fd, SHUT_RD);
 467         else
 468                 r = 0;
 469         if (r < 0)
 470                 return -errno;
 471
 472         return TAKE_FD(fd);
 473 }
 474
 475 static int fixup_input(
 476                 const ExecContext *context,
 477                 int socket_fd,
 478                 bool apply_tty_stdin) {
 479
 480         ExecInput std_input;
 481
 482         assert(context);
 483
 484         std_input = context->std_input;
 485
 486         if (is_terminal_input(std_input) && !apply_tty_stdin)
 487                 return EXEC_INPUT_NULL;
 488
 489         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 490                 return EXEC_INPUT_NULL;
 491
 492         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 493                 return EXEC_INPUT_NULL;
 494
 495         return std_input;
 496 }
 497
 498 static int fixup_output(ExecOutput output, int socket_fd) {
 499
 500         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 501                 return EXEC_OUTPUT_INHERIT;
 502
 503         return output;
 504 }
 505
 506 static int setup_input(
 507                 const ExecContext *context,
 508                 const ExecParameters *params,
 509                 int socket_fd,
 510                 const int named_iofds[static 3]) {
 511
 512         ExecInput i;
 513         int r;
 514
 515         assert(context);
 516         assert(params);
 517         assert(named_iofds);
 518
 519         if (params->stdin_fd >= 0) {
 520                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 521                         return -errno;
 522
 523                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 524                 if (isatty(STDIN_FILENO)) {
 525                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 526
 527                         (void) exec_context_tty_size(context, &rows, &cols);
 528                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 529                         (void) reset_terminal_fd(STDIN_FILENO, true);
 530                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 531                 }
 532
 533                 return STDIN_FILENO;
 534         }
 535
 536         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 537
 538         switch (i) {
 539
 540         case EXEC_INPUT_NULL:
 541                 return open_null_as(O_RDONLY, STDIN_FILENO);
 542
 543         case EXEC_INPUT_TTY:
 544         case EXEC_INPUT_TTY_FORCE:
 545         case EXEC_INPUT_TTY_FAIL: {
 546                 unsigned rows, cols;
 547                 int fd;
 548
 549                 fd = acquire_terminal(exec_context_tty_path(context),
 550                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 551                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 552                                                                   ACQUIRE_TERMINAL_WAIT,
 553                                       USEC_INFINITY);
 554                 if (fd < 0)
 555                         return fd;
 556
 557                 r = exec_context_tty_size(context, &rows, &cols);
 558                 if (r < 0)
 559                         return r;
 560
 561                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 562                 if (r < 0)
 563                         return r;
 564
 565                 return move_fd(fd, STDIN_FILENO, false);
 566         }
 567
 568         case EXEC_INPUT_SOCKET:
 569                 assert(socket_fd >= 0);
 570
 571                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 572
 573         case EXEC_INPUT_NAMED_FD:
 574                 assert(named_iofds[STDIN_FILENO] >= 0);
 575
 576                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 577                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 578
 579         case EXEC_INPUT_DATA: {
 580                 int fd;
 581
 582                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 583                 if (fd < 0)
 584                         return fd;
 585
 586                 return move_fd(fd, STDIN_FILENO, false);
 587         }
 588
 589         case EXEC_INPUT_FILE: {
 590                 bool rw;
 591                 int fd;
 592
 593                 assert(context->stdio_file[STDIN_FILENO]);
 594
 595                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 596                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 597
 598                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 599                 if (fd < 0)
 600                         return fd;
 601
 602                 return move_fd(fd, STDIN_FILENO, false);
 603         }
 604
 605         default:
 606                 assert_not_reached();
 607         }
 608 }
 609
 610 static bool can_inherit_stderr_from_stdout(
 611                 const ExecContext *context,
 612                 ExecOutput o,
 613                 ExecOutput e) {
 614
 615         assert(context);
 616
 617         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 618          * stderr fd */
 619
 620         if (e == EXEC_OUTPUT_INHERIT)
 621                 return true;
 622         if (e != o)
 623                 return false;
 624
 625         if (e == EXEC_OUTPUT_NAMED_FD)
 626                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 627
 628         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 629                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 630
 631         return true;
 632 }
 633
 634 static int setup_output(
 635                 const Unit *unit,
 636                 const ExecContext *context,
 637                 const ExecParameters *params,
 638                 int fileno,
 639                 int socket_fd,
 640                 const int named_iofds[static 3],
 641                 const char *ident,
 642                 uid_t uid,
 643                 gid_t gid,
 644                 dev_t *journal_stream_dev,
 645                 ino_t *journal_stream_ino) {
 646
 647         ExecOutput o;
 648         ExecInput i;
 649         int r;
 650
 651         assert(unit);
 652         assert(context);
 653         assert(params);
 654         assert(ident);
 655         assert(journal_stream_dev);
 656         assert(journal_stream_ino);
 657
 658         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 659
 660                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 661                         return -errno;
 662
 663                 return STDOUT_FILENO;
 664         }
 665
 666         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 667                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 668                         return -errno;
 669
 670                 return STDERR_FILENO;
 671         }
 672
 673         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 674         o = fixup_output(context->std_output, socket_fd);
 675
 676         if (fileno == STDERR_FILENO) {
 677                 ExecOutput e;
 678                 e = fixup_output(context->std_error, socket_fd);
 679
 680                 /* This expects the input and output are already set up */
 681
 682                 /* Don't change the stderr file descriptor if we inherit all
 683                  * the way and are not on a tty */
 684                 if (e == EXEC_OUTPUT_INHERIT &&
 685                     o == EXEC_OUTPUT_INHERIT &&
 686                     i == EXEC_INPUT_NULL &&
 687                     !is_terminal_input(context->std_input) &&
 688                     getppid() != 1)
 689                         return fileno;
 690
 691                 /* Duplicate from stdout if possible */
 692                 if (can_inherit_stderr_from_stdout(context, o, e))
 693                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 694
 695                 o = e;
 696
 697         } else if (o == EXEC_OUTPUT_INHERIT) {
 698                 /* If input got downgraded, inherit the original value */
 699                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 700                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 701
 702                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 703                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 704                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 705
 706                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 707                 if (getppid() != 1)
 708                         return fileno;
 709
 710                 /* We need to open /dev/null here anew, to get the right access mode. */
 711                 return open_null_as(O_WRONLY, fileno);
 712         }
 713
 714         switch (o) {
 715
 716         case EXEC_OUTPUT_NULL:
 717                 return open_null_as(O_WRONLY, fileno);
 718
 719         case EXEC_OUTPUT_TTY:
 720                 if (is_terminal_input(i))
 721                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 722
 723                 /* We don't reset the terminal if this is just about output */
 724                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 725
 726         case EXEC_OUTPUT_KMSG:
 727         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 728         case EXEC_OUTPUT_JOURNAL:
 729         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 730                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 731                 if (r < 0) {
 732                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 733                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 734                         r = open_null_as(O_WRONLY, fileno);
 735                 } else {
 736                         struct stat st;
 737
 738                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 739                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 740                          * services to detect whether they are connected to the journal or not.
 741                          *
 742                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 743                          * about STDERR as that's usually the best way to do logging. */
 744
 745                         if (fstat(fileno, &st) >= 0 &&
 746                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 747                                 *journal_stream_dev = st.st_dev;
 748                                 *journal_stream_ino = st.st_ino;
 749                         }
 750                 }
 751                 return r;
 752
 753         case EXEC_OUTPUT_SOCKET:
 754                 assert(socket_fd >= 0);
 755
 756                 return RET_NERRNO(dup2(socket_fd, fileno));
 757
 758         case EXEC_OUTPUT_NAMED_FD:
 759                 assert(named_iofds[fileno] >= 0);
 760
 761                 (void) fd_nonblock(named_iofds[fileno], false);
 762                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 763
 764         case EXEC_OUTPUT_FILE:
 765         case EXEC_OUTPUT_FILE_APPEND:
 766         case EXEC_OUTPUT_FILE_TRUNCATE: {
 767                 bool rw;
 768                 int fd, flags;
 769
 770                 assert(context->stdio_file[fileno]);
 771
 772                 rw = context->std_input == EXEC_INPUT_FILE &&
 773                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 774
 775                 if (rw)
 776                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 777
 778                 flags = O_WRONLY;
 779                 if (o == EXEC_OUTPUT_FILE_APPEND)
 780                         flags |= O_APPEND;
 781                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 782                         flags |= O_TRUNC;
 783
 784                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 785                 if (fd < 0)
 786                         return fd;
 787
 788                 return move_fd(fd, fileno, 0);
 789         }
 790
 791         default:
 792                 assert_not_reached();
 793         }
 794 }
 795
 796 static int chown_terminal(int fd, uid_t uid) {
 797         int r;
 798
 799         assert(fd >= 0);
 800
 801         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 802         if (isatty(fd) < 1) {
 803                 if (IN_SET(errno, EINVAL, ENOTTY))
 804                         return 0; /* not a tty */
 805
 806                 return -errno;
 807         }
 808
 809         /* This might fail. What matters are the results. */
 810         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 811         if (r < 0)
 812                 return r;
 813
 814         return 1;
 815 }
 816
 817 static int setup_confirm_stdio(
 818                 const ExecContext *context,
 819                 const char *vc,
 820                 int *ret_saved_stdin,
 821                 int *ret_saved_stdout) {
 822
 823         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 824         unsigned rows, cols;
 825         int r;
 826
 827         assert(ret_saved_stdin);
 828         assert(ret_saved_stdout);
 829
 830         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 831         if (saved_stdin < 0)
 832                 return -errno;
 833
 834         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 835         if (saved_stdout < 0)
 836                 return -errno;
 837
 838         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 839         if (fd < 0)
 840                 return fd;
 841
 842         r = chown_terminal(fd, getuid());
 843         if (r < 0)
 844                 return r;
 845
 846         r = reset_terminal_fd(fd, true);
 847         if (r < 0)
 848                 return r;
 849
 850         r = exec_context_tty_size(context, &rows, &cols);
 851         if (r < 0)
 852                 return r;
 853
 854         r = terminal_set_size_fd(fd, vc, rows, cols);
 855         if (r < 0)
 856                 return r;
 857
 858         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 859         TAKE_FD(fd);
 860         if (r < 0)
 861                 return r;
 862
 863         *ret_saved_stdin = TAKE_FD(saved_stdin);
 864         *ret_saved_stdout = TAKE_FD(saved_stdout);
 865         return 0;
 866 }
 867
 868 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 869         assert(err < 0);
 870
 871         if (err == -ETIMEDOUT)
 872                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 873         else {
 874                 errno = -err;
 875                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 876         }
 877 }
 878
 879 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 880         _cleanup_close_ int fd = -EBADF;
 881
 882         assert(vc);
 883
 884         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 885         if (fd < 0)
 886                 return;
 887
 888         write_confirm_error_fd(err, fd, u);
 889 }
 890
 891 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 892         int r = 0;
 893
 894         assert(saved_stdin);
 895         assert(saved_stdout);
 896
 897         release_terminal();
 898
 899         if (*saved_stdin >= 0)
 900                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 901                         r = -errno;
 902
 903         if (*saved_stdout >= 0)
 904                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 905                         r = -errno;
 906
 907         *saved_stdin = safe_close(*saved_stdin);
 908         *saved_stdout = safe_close(*saved_stdout);
 909
 910         return r;
 911 }
 912
 913 enum {
 914         CONFIRM_PRETEND_FAILURE = -1,
 915         CONFIRM_PRETEND_SUCCESS =  0,
 916         CONFIRM_EXECUTE = 1,
 917 };
 918
 919 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 920         int saved_stdout = -1, saved_stdin = -1, r;
 921         _cleanup_free_ char *e = NULL;
 922         char c;
 923
 924         /* For any internal errors, assume a positive response. */
 925         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 926         if (r < 0) {
 927                 write_confirm_error(r, vc, u);
 928                 return CONFIRM_EXECUTE;
 929         }
 930
 931         /* confirm_spawn might have been disabled while we were sleeping. */
 932         if (manager_is_confirm_spawn_disabled(u->manager)) {
 933                 r = 1;
 934                 goto restore_stdio;
 935         }
 936
 937         e = ellipsize(cmdline, 60, 100);
 938         if (!e) {
 939                 log_oom();
 940                 r = CONFIRM_EXECUTE;
 941                 goto restore_stdio;
 942         }
 943
 944         for (;;) {
 945                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 946                 if (r < 0) {
 947                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 948                         r = CONFIRM_EXECUTE;
 949                         goto restore_stdio;
 950                 }
 951
 952                 switch (c) {
 953                 case 'c':
 954                         printf("Resuming normal execution.\n");
 955                         manager_disable_confirm_spawn();
 956                         r = 1;
 957                         break;
 958                 case 'D':
 959                         unit_dump(u, stdout, "  ");
 960                         continue; /* ask again */
 961                 case 'f':
 962                         printf("Failing execution.\n");
 963                         r = CONFIRM_PRETEND_FAILURE;
 964                         break;
 965                 case 'h':
 966                         printf("  c - continue, proceed without asking anymore\n"
 967                                "  D - dump, show the state of the unit\n"
 968                                "  f - fail, don't execute the command and pretend it failed\n"
 969                                "  h - help\n"
 970                                "  i - info, show a short summary of the unit\n"
 971                                "  j - jobs, show jobs that are in progress\n"
 972                                "  s - skip, don't execute the command and pretend it succeeded\n"
 973                                "  y - yes, execute the command\n");
 974                         continue; /* ask again */
 975                 case 'i':
 976                         printf("  Description: %s\n"
 977                                "  Unit:        %s\n"
 978                                "  Command:     %s\n",
 979                                u->id, u->description, cmdline);
 980                         continue; /* ask again */
 981                 case 'j':
 982                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 983                         continue; /* ask again */
 984                 case 'n':
 985                         /* 'n' was removed in favor of 'f'. */
 986                         printf("Didn't understand 'n', did you mean 'f'?\n");
 987                         continue; /* ask again */
 988                 case 's':
 989                         printf("Skipping execution.\n");
 990                         r = CONFIRM_PRETEND_SUCCESS;
 991                         break;
 992                 case 'y':
 993                         r = CONFIRM_EXECUTE;
 994                         break;
 995                 default:
 996                         assert_not_reached();
 997                 }
 998                 break;
 999         }
1000
1001 restore_stdio:
1002         restore_confirm_stdio(&saved_stdin, &saved_stdout);
1003         return r;
1004 }
1005
1006 static int get_fixed_user(const ExecContext *c, const char **user,
1007                           uid_t *uid, gid_t *gid,
1008                           const char **home, const char **shell) {
1009         int r;
1010         const char *name;
1011
1012         assert(c);
1013
1014         if (!c->user)
1015                 return 0;
1016
1017         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1018          * (i.e. are "/" or "/bin/nologin"). */
1019
1020         name = c->user;
1021         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1022         if (r < 0)
1023                 return r;
1024
1025         *user = name;
1026         return 0;
1027 }
1028
1029 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1030         int r;
1031         const char *name;
1032
1033         assert(c);
1034
1035         if (!c->group)
1036                 return 0;
1037
1038         name = c->group;
1039         r = get_group_creds(&name, gid, 0);
1040         if (r < 0)
1041                 return r;
1042
1043         *group = name;
1044         return 0;
1045 }
1046
1047 static int get_supplementary_groups(const ExecContext *c, const char *user,
1048                                     const char *group, gid_t gid,
1049                                     gid_t **supplementary_gids, int *ngids) {
1050         int r, k = 0;
1051         int ngroups_max;
1052         bool keep_groups = false;
1053         gid_t *groups = NULL;
1054         _cleanup_free_ gid_t *l_gids = NULL;
1055
1056         assert(c);
1057
1058         /*
1059          * If user is given, then lookup GID and supplementary groups list.
1060          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1061          * here and as early as possible so we keep the list of supplementary
1062          * groups of the caller.
1063          */
1064         if (user && gid_is_valid(gid) && gid != 0) {
1065                 /* First step, initialize groups from /etc/groups */
1066                 if (initgroups(user, gid) < 0)
1067                         return -errno;
1068
1069                 keep_groups = true;
1070         }
1071
1072         if (strv_isempty(c->supplementary_groups))
1073                 return 0;
1074
1075         /*
1076          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1077          * be positive, otherwise fail.
1078          */
1079         errno = 0;
1080         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1081         if (ngroups_max <= 0)
1082                 return errno_or_else(EOPNOTSUPP);
1083
1084         l_gids = new(gid_t, ngroups_max);
1085         if (!l_gids)
1086                 return -ENOMEM;
1087
1088         if (keep_groups) {
1089                 /*
1090                  * Lookup the list of groups that the user belongs to, we
1091                  * avoid NSS lookups here too for gid=0.
1092                  */
1093                 k = ngroups_max;
1094                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1095                         return -EINVAL;
1096         } else
1097                 k = 0;
1098
1099         STRV_FOREACH(i, c->supplementary_groups) {
1100                 const char *g;
1101
1102                 if (k >= ngroups_max)
1103                         return -E2BIG;
1104
1105                 g = *i;
1106                 r = get_group_creds(&g, l_gids+k, 0);
1107                 if (r < 0)
1108                         return r;
1109
1110                 k++;
1111         }
1112
1113         /*
1114          * Sets ngids to zero to drop all supplementary groups, happens
1115          * when we are under root and SupplementaryGroups= is empty.
1116          */
1117         if (k == 0) {
1118                 *ngids = 0;
1119                 return 0;
1120         }
1121
1122         /* Otherwise get the final list of supplementary groups */
1123         groups = memdup(l_gids, sizeof(gid_t) * k);
1124         if (!groups)
1125                 return -ENOMEM;
1126
1127         *supplementary_gids = groups;
1128         *ngids = k;
1129
1130         groups = NULL;
1131
1132         return 0;
1133 }
1134
/* Applies the supplementary groups (if 'ngids' is positive) and then switches the real, effective and
 * saved GID to 'gid' (if it is valid). Returns 0 on success, negative errno-style value otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1153
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set. Returns 1 if the securebits were changed, 0 if they already had the requested value,
 * and a negative errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        unsigned wanted = ((unsigned) before & ~mask) | bits;
        if (wanted == (unsigned) before)
                return 0; /* nothing to do */

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1172
1173 static int enforce_user(
1174                 const ExecContext *context,
1175                 uid_t uid,
1176                 uint64_t capability_ambient_set) {
1177         assert(context);
1178         int r;
1179
1180         if (!uid_is_valid(uid))
1181                 return 0;
1182
1183         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1184          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1185          * case. */
1186
1187         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1188
1189                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1190                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1191                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1192                 if (r < 0)
1193                         return r;
1194         }
1195
1196         /* Second step: actually set the uids */
1197         if (setresuid(uid, uid, uid) < 0)
1198                 return -errno;
1199
1200         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1201          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1202          * outside of this call. */
1203         return 0;
1204 }
1205
1206 #if HAVE_PAM
1207
1208 static int null_conv(
1209                 int num_msg,
1210                 const struct pam_message **msg,
1211                 struct pam_response **resp,
1212                 void *appdata_ptr) {
1213
1214         /* We don't support conversations */
1215
1216         return PAM_CONV_ERR;
1217 }
1218
1219 #endif
1220
/* Opens a PAM session for PAM service 'name' and user 'user', and forks off a helper process
 * "(sd-pam)" that tears it down again later.
 *
 *   name, user  PAM service name and user name, passed to pam_start()
 *   uid, gid    credentials the helper process drops to
 *   tty         TTY to register with PAM; if NULL the TTY of stdin is used when there is one
 *   env         environment block; every entry is handed to PAM, and on success the block is replaced
 *               by the environment list reported by PAM
 *   fds, n_fds  file descriptors that are closed in the helper process
 *
 * Returns a negative errno-style value on failure (-EPERM for PAM errors, since those do not map to
 * errno), otherwise the result of strv_free_and_replace(). When built without PAM support this is a
 * no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Hand our environment block to PAM. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal, we only log it. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * never returns a negative value and does not set errno. Hence check
                                 * the return value itself, so that 'sig' is only looked at on success. */
                                int k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM failures (pam_code is set) and everything else (r
         * carries a negative errno-style value). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1437
1438 static void rename_process_from_path(const char *path) {
1439         _cleanup_free_ char *buf = NULL;
1440         const char *p;
1441
1442         assert(path);
1443
1444         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1445          * /bin/ps */
1446
1447         if (path_extract_filename(path, &buf) < 0) {
1448                 rename_process("(...)");
1449                 return;
1450         }
1451
1452         size_t l = strlen(buf);
1453         if (l > 8) {
1454                 /* The end of the process name is usually more interesting, since the first bit might just be
1455                  * "systemd-" */
1456                 p = buf + l - 8;
1457                 l = 8;
1458         } else
1459                 p = buf;
1460
1461         char process_name[11];
1462         process_name[0] = '(';
1463         memcpy(process_name+1, p, l);
1464         process_name[1+l] = ')';
1465         process_name[1+l+1] = 0;
1466
1467         rename_process(process_name);
1468 }
1469
1470 static bool context_has_address_families(const ExecContext *c) {
1471         assert(c);
1472
1473         return c->address_families_allow_list ||
1474                 !set_isempty(c->address_families);
1475 }
1476
1477 static bool context_has_syscall_filters(const ExecContext *c) {
1478         assert(c);
1479
1480         return c->syscall_allow_list ||
1481                 !hashmap_isempty(c->syscall_filter);
1482 }
1483
1484 static bool context_has_syscall_logs(const ExecContext *c) {
1485         assert(c);
1486
1487         return c->syscall_log_allow_list ||
1488                 !hashmap_isempty(c->syscall_log);
1489 }
1490
1491 static bool context_has_no_new_privileges(const ExecContext *c) {
1492         assert(c);
1493
1494         if (c->no_new_privileges)
1495                 return true;
1496
1497         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1498                 return false;
1499
1500         /* We need NNP if we have any form of seccomp and are unprivileged */
1501         return c->lock_personality ||
1502                 c->memory_deny_write_execute ||
1503                 c->private_devices ||
1504                 c->protect_clock ||
1505                 c->protect_hostname ||
1506                 c->protect_kernel_tunables ||
1507                 c->protect_kernel_modules ||
1508                 c->protect_kernel_logs ||
1509                 context_has_address_families(c) ||
1510                 exec_context_restrict_namespaces_set(c) ||
1511                 c->restrict_realtime ||
1512                 c->restrict_suid_sgid ||
1513                 !set_isempty(c->syscall_archs) ||
1514                 context_has_syscall_filters(c) ||
1515                 context_has_syscall_logs(c);
1516 }
1517
1518 bool exec_context_has_credentials(const ExecContext *context) {
1519
1520         assert(context);
1521
1522         return !hashmap_isempty(context->set_credentials) ||
1523                 !hashmap_isempty(context->load_credentials) ||
1524                 !set_isempty(context->import_credentials);
1525 }
1526
1527 #if HAVE_SECCOMP
1528
1529 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1530
1531         if (is_seccomp_available())
1532                 return false;
1533
1534         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1535         return true;
1536 }
1537
1538 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1539         uint32_t negative_action, default_action, action;
1540         int r;
1541
1542         assert(u);
1543         assert(c);
1544
1545         if (!context_has_syscall_filters(c))
1546                 return 0;
1547
1548         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1549                 return 0;
1550
1551         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1552
1553         if (c->syscall_allow_list) {
1554                 default_action = negative_action;
1555                 action = SCMP_ACT_ALLOW;
1556         } else {
1557                 default_action = SCMP_ACT_ALLOW;
1558                 action = negative_action;
1559         }
1560
1561         if (needs_ambient_hack) {
1562                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1563                 if (r < 0)
1564                         return r;
1565         }
1566
1567         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1568 }
1569
1570 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1571 #ifdef SCMP_ACT_LOG
1572         uint32_t default_action, action;
1573 #endif
1574
1575         assert(u);
1576         assert(c);
1577
1578         if (!context_has_syscall_logs(c))
1579                 return 0;
1580
1581 #ifdef SCMP_ACT_LOG
1582         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1583                 return 0;
1584
1585         if (c->syscall_log_allow_list) {
1586                 /* Log nothing but the ones listed */
1587                 default_action = SCMP_ACT_ALLOW;
1588                 action = SCMP_ACT_LOG;
1589         } else {
1590                 /* Log everything but the ones listed */
1591                 default_action = SCMP_ACT_LOG;
1592                 action = SCMP_ACT_ALLOW;
1593         }
1594
1595         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1596 #else
1597         /* old libseccomp */
1598         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1599         return 0;
1600 #endif
1601 }
1602
1603 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1604         assert(u);
1605         assert(c);
1606
1607         if (set_isempty(c->syscall_archs))
1608                 return 0;
1609
1610         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1611                 return 0;
1612
1613         return seccomp_restrict_archs(c->syscall_archs);
1614 }
1615
1616 static int apply_address_families(const Unit* u, const ExecContext *c) {
1617         assert(u);
1618         assert(c);
1619
1620         if (!context_has_address_families(c))
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1624                 return 0;
1625
1626         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1627 }
1628
1629 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1630         int r;
1631
1632         assert(u);
1633         assert(c);
1634
1635         if (!c->memory_deny_write_execute)
1636                 return 0;
1637
1638         /* use prctl() if kernel supports it (6.3) */
1639         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1640         if (r == 0) {
1641                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1642                 return 0;
1643         }
1644         if (r < 0 && errno != EINVAL)
1645                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1646         /* else use seccomp */
1647         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1648
1649         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1650                 return 0;
1651
1652         return seccomp_memory_deny_write_execute();
1653 }
1654
1655 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1656         assert(u);
1657         assert(c);
1658
1659         if (!c->restrict_realtime)
1660                 return 0;
1661
1662         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1663                 return 0;
1664
1665         return seccomp_restrict_realtime();
1666 }
1667
1668 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1669         assert(u);
1670         assert(c);
1671
1672         if (!c->restrict_suid_sgid)
1673                 return 0;
1674
1675         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1676                 return 0;
1677
1678         return seccomp_restrict_suid_sgid();
1679 }
1680
1681 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1682         assert(u);
1683         assert(c);
1684
1685         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1686          * let's protect even those systems where this is left on in the kernel. */
1687
1688         if (!c->protect_kernel_tunables)
1689                 return 0;
1690
1691         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1692                 return 0;
1693
1694         return seccomp_protect_sysctl();
1695 }
1696
1697 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1698         assert(u);
1699         assert(c);
1700
1701         /* Turn off module syscalls on ProtectKernelModules=yes */
1702
1703         if (!c->protect_kernel_modules)
1704                 return 0;
1705
1706         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1707                 return 0;
1708
1709         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1710 }
1711
1712 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1713         assert(u);
1714         assert(c);
1715
1716         if (!c->protect_kernel_logs)
1717                 return 0;
1718
1719         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1720                 return 0;
1721
1722         return seccomp_protect_syslog();
1723 }
1724
1725 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1726         assert(u);
1727         assert(c);
1728
1729         if (!c->protect_clock)
1730                 return 0;
1731
1732         if (skip_seccomp_unavailable(u, "ProtectClock="))
1733                 return 0;
1734
1735         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1736 }
1737
1738 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1739         assert(u);
1740         assert(c);
1741
1742         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1743
1744         if (!c->private_devices)
1745                 return 0;
1746
1747         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1748                 return 0;
1749
1750         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1751 }
1752
1753 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1754         assert(u);
1755         assert(c);
1756
1757         if (!exec_context_restrict_namespaces_set(c))
1758                 return 0;
1759
1760         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1761                 return 0;
1762
1763         return seccomp_restrict_namespaces(c->restrict_namespaces);
1764 }
1765
1766 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1767         unsigned long personality;
1768         int r;
1769
1770         assert(u);
1771         assert(c);
1772
1773         if (!c->lock_personality)
1774                 return 0;
1775
1776         if (skip_seccomp_unavailable(u, "LockPersonality="))
1777                 return 0;
1778
1779         personality = c->personality;
1780
1781         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1782         if (personality == PERSONALITY_INVALID) {
1783
1784                 r = opinionated_personality(&personality);
1785                 if (r < 0)
1786                         return r;
1787         }
1788
1789         return seccomp_lock_personality(personality);
1790 }
1791
1792 #endif
1793
1794 #if HAVE_LIBBPF
1795 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1796         assert(u);
1797         assert(c);
1798
1799         if (!exec_context_restrict_filesystems_set(c))
1800                 return 0;
1801
1802         if (!u->manager->restrict_fs) {
1803                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1804                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1805                 return 0;
1806         }
1807
1808         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1809 }
1810 #endif
1811
1812 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1813         assert(u);
1814         assert(c);
1815
1816         if (!c->protect_hostname)
1817                 return 0;
1818
1819         if (ns_type_supported(NAMESPACE_UTS)) {
1820                 if (unshare(CLONE_NEWUTS) < 0) {
1821                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1822                                 *ret_exit_status = EXIT_NAMESPACE;
1823                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1824                         }
1825
1826                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1827                 }
1828         } else
1829                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1830
1831 #if HAVE_SECCOMP
1832         int r;
1833
1834         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1835                 return 0;
1836
1837         r = seccomp_protect_hostname();
1838         if (r < 0) {
1839                 *ret_exit_status = EXIT_SECCOMP;
1840                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1841         }
1842 #endif
1843
1844         return 0;
1845 }
1846
1847 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1848         assert(idle_pipe);
1849
1850         idle_pipe[1] = safe_close(idle_pipe[1]);
1851         idle_pipe[2] = safe_close(idle_pipe[2]);
1852
1853         if (idle_pipe[0] >= 0) {
1854                 int r;
1855
1856                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1857
1858                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1859                         ssize_t n;
1860
1861                         /* Signal systemd that we are bored and want to continue. */
1862                         n = write(idle_pipe[3], "x", 1);
1863                         if (n > 0)
1864                                 /* Wait for systemd to react to the signal above. */
1865                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1866                 }
1867
1868                 idle_pipe[0] = safe_close(idle_pipe[0]);
1869
1870         }
1871
1872         idle_pipe[3] = safe_close(idle_pipe[3]);
1873 }
1874
1875 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1876
1877 static int build_environment(
1878                 const Unit *u,
1879                 const ExecContext *c,
1880                 const ExecParameters *p,
1881                 const CGroupContext *cgroup_context,
1882                 size_t n_fds,
1883                 char **fdnames,
1884                 const char *home,
1885                 const char *username,
1886                 const char *shell,
1887                 dev_t journal_stream_dev,
1888                 ino_t journal_stream_ino,
1889                 const char *memory_pressure_path,
1890                 char ***ret) {
1891
1892         _cleanup_strv_free_ char **our_env = NULL;
1893         size_t n_env = 0;
1894         char *x;
1895         int r;
1896
1897         assert(u);
1898         assert(c);
1899         assert(p);
1900         assert(ret);
1901
1902 #define N_ENV_VARS 19
1903         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1904         if (!our_env)
1905                 return -ENOMEM;
1906
1907         if (n_fds > 0) {
1908                 _cleanup_free_ char *joined = NULL;
1909
1910                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1911                         return -ENOMEM;
1912                 our_env[n_env++] = x;
1913
1914                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1915                         return -ENOMEM;
1916                 our_env[n_env++] = x;
1917
1918                 joined = strv_join(fdnames, ":");
1919                 if (!joined)
1920                         return -ENOMEM;
1921
1922                 x = strjoin("LISTEN_FDNAMES=", joined);
1923                 if (!x)
1924                         return -ENOMEM;
1925                 our_env[n_env++] = x;
1926         }
1927
1928         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1929                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1930                         return -ENOMEM;
1931                 our_env[n_env++] = x;
1932
1933                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1934                         return -ENOMEM;
1935                 our_env[n_env++] = x;
1936         }
1937
1938         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1939          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1940          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1941         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1942                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1943                 if (!x)
1944                         return -ENOMEM;
1945                 our_env[n_env++] = x;
1946         }
1947
1948         if (home) {
1949                 x = strjoin("HOME=", home);
1950                 if (!x)
1951                         return -ENOMEM;
1952
1953                 path_simplify(x + 5);
1954                 our_env[n_env++] = x;
1955         }
1956
1957         if (username) {
1958                 x = strjoin("LOGNAME=", username);
1959                 if (!x)
1960                         return -ENOMEM;
1961                 our_env[n_env++] = x;
1962
1963                 x = strjoin("USER=", username);
1964                 if (!x)
1965                         return -ENOMEM;
1966                 our_env[n_env++] = x;
1967         }
1968
1969         if (shell) {
1970                 x = strjoin("SHELL=", shell);
1971                 if (!x)
1972                         return -ENOMEM;
1973
1974                 path_simplify(x + 6);
1975                 our_env[n_env++] = x;
1976         }
1977
1978         if (!sd_id128_is_null(u->invocation_id)) {
1979                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1980                         return -ENOMEM;
1981
1982                 our_env[n_env++] = x;
1983         }
1984
1985         if (exec_context_needs_term(c)) {
1986                 _cleanup_free_ char *cmdline = NULL;
1987                 const char *tty_path, *term = NULL;
1988
1989                 tty_path = exec_context_tty_path(c);
1990
1991                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1992                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1993                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1994
1995                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1996                         term = getenv("TERM");
1997                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1998                         _cleanup_free_ char *key = NULL;
1999
2000                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2001                         if (!key)
2002                                 return -ENOMEM;
2003
2004                         r = proc_cmdline_get_key(key, 0, &cmdline);
2005                         if (r < 0)
2006                                 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2007                         else if (r > 0)
2008                                 term = cmdline;
2009                 }
2010
2011                 if (!term)
2012                         term = default_term_for_tty(tty_path);
2013
2014                 x = strjoin("TERM=", term);
2015                 if (!x)
2016                         return -ENOMEM;
2017                 our_env[n_env++] = x;
2018         }
2019
2020         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2021                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2022                         return -ENOMEM;
2023
2024                 our_env[n_env++] = x;
2025         }
2026
2027         if (c->log_namespace) {
2028                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2029                 if (!x)
2030                         return -ENOMEM;
2031
2032                 our_env[n_env++] = x;
2033         }
2034
2035         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2036                 _cleanup_free_ char *joined = NULL;
2037                 const char *n;
2038
2039                 if (!p->prefix[t])
2040                         continue;
2041
2042                 if (c->directories[t].n_items == 0)
2043                         continue;
2044
2045                 n = exec_directory_env_name_to_string(t);
2046                 if (!n)
2047                         continue;
2048
2049                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2050                         _cleanup_free_ char *prefixed = NULL;
2051
2052                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2053                         if (!prefixed)
2054                                 return -ENOMEM;
2055
2056                         if (!strextend_with_separator(&joined, ":", prefixed))
2057                                 return -ENOMEM;
2058                 }
2059
2060                 x = strjoin(n, "=", joined);
2061                 if (!x)
2062                         return -ENOMEM;
2063
2064                 our_env[n_env++] = x;
2065         }
2066
2067         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2068                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2069                 if (!x)
2070                         return -ENOMEM;
2071
2072                 our_env[n_env++] = x;
2073         }
2074
2075         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2076                 return -ENOMEM;
2077
2078         our_env[n_env++] = x;
2079
2080         if (memory_pressure_path) {
2081                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2082                 if (!x)
2083                         return -ENOMEM;
2084
2085                 our_env[n_env++] = x;
2086
2087                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2088                         _cleanup_free_ char *b = NULL, *e = NULL;
2089
2090                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2091                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2092                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2093                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2094                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2095                                 return -ENOMEM;
2096
2097                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2098                                 return -ENOMEM;
2099
2100                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2101                         if (!x)
2102                                 return -ENOMEM;
2103
2104                         our_env[n_env++] = x;
2105                 }
2106         }
2107
2108         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2109 #undef N_ENV_VARS
2110
2111         *ret = TAKE_PTR(our_env);
2112
2113         return 0;
2114 }
2115
2116 static int build_pass_environment(const ExecContext *c, char ***ret) {
2117         _cleanup_strv_free_ char **pass_env = NULL;
2118         size_t n_env = 0;
2119
2120         STRV_FOREACH(i, c->pass_environment) {
2121                 _cleanup_free_ char *x = NULL;
2122                 char *v;
2123
2124                 v = getenv(*i);
2125                 if (!v)
2126                         continue;
2127                 x = strjoin(*i, "=", v);
2128                 if (!x)
2129                         return -ENOMEM;
2130
2131                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2132                         return -ENOMEM;
2133
2134                 pass_env[n_env++] = TAKE_PTR(x);
2135                 pass_env[n_env] = NULL;
2136         }
2137
2138         *ret = TAKE_PTR(pass_env);
2139
2140         return 0;
2141 }
2142
2143 bool exec_needs_network_namespace(const ExecContext *context) {
2144         assert(context);
2145
2146         return context->private_network || context->network_namespace_path;
2147 }
2148
2149 static bool exec_needs_ephemeral(const ExecContext *context) {
2150         return (context->root_image || context->root_directory) && context->root_ephemeral;
2151 }
2152
2153 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2154         assert(context);
2155
2156         return context->private_ipc || context->ipc_namespace_path;
2157 }
2158
2159 bool exec_needs_mount_namespace(
2160                 const ExecContext *context,
2161                 const ExecParameters *params,
2162                 const ExecRuntime *runtime) {
2163
2164         assert(context);
2165
2166         if (context->root_image)
2167                 return true;
2168
2169         if (!strv_isempty(context->read_write_paths) ||
2170             !strv_isempty(context->read_only_paths) ||
2171             !strv_isempty(context->inaccessible_paths) ||
2172             !strv_isempty(context->exec_paths) ||
2173             !strv_isempty(context->no_exec_paths))
2174                 return true;
2175
2176         if (context->n_bind_mounts > 0)
2177                 return true;
2178
2179         if (context->n_temporary_filesystems > 0)
2180                 return true;
2181
2182         if (context->n_mount_images > 0)
2183                 return true;
2184
2185         if (context->n_extension_images > 0)
2186                 return true;
2187
2188         if (!strv_isempty(context->extension_directories))
2189                 return true;
2190
2191         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2192                 return true;
2193
2194         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2195                 return true;
2196
2197         if (context->private_devices ||
2198             context->private_mounts > 0 ||
2199             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2200             context->protect_system != PROTECT_SYSTEM_NO ||
2201             context->protect_home != PROTECT_HOME_NO ||
2202             context->protect_kernel_tunables ||
2203             context->protect_kernel_modules ||
2204             context->protect_kernel_logs ||
2205             context->protect_control_groups ||
2206             context->protect_proc != PROTECT_PROC_DEFAULT ||
2207             context->proc_subset != PROC_SUBSET_ALL ||
2208             exec_needs_ipc_namespace(context))
2209                 return true;
2210
2211         if (context->root_directory) {
2212                 if (exec_context_get_effective_mount_apivfs(context))
2213                         return true;
2214
2215                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2216                         if (params && !params->prefix[t])
2217                                 continue;
2218
2219                         if (context->directories[t].n_items > 0)
2220                                 return true;
2221                 }
2222         }
2223
2224         if (context->dynamic_user &&
2225             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2226              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2227              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2228                 return true;
2229
2230         if (context->log_namespace)
2231                 return true;
2232
2233         return false;
2234 }
2235
2236 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2237         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2238         _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2239         _cleanup_close_ int unshare_ready_fd = -EBADF;
2240         _cleanup_(sigkill_waitp) pid_t pid = 0;
2241         uint64_t c = 1;
2242         ssize_t n;
2243         int r;
2244
2245         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2246          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2247          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2248          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2249          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2250          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2251          * continues execution normally.
2252          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2253          * does not need CAP_SETUID to write the single line mapping to itself. */
2254
2255         /* Can only set up multiple mappings with CAP_SETUID. */
2256         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2257                 r = asprintf(&uid_map,
2258                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2259                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2260                              ouid, ouid, uid, uid);
2261         else
2262                 r = asprintf(&uid_map,
2263                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2264                              ouid, ouid);
2265
2266         if (r < 0)
2267                 return -ENOMEM;
2268
2269         /* Can only set up multiple mappings with CAP_SETGID. */
2270         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2271                 r = asprintf(&gid_map,
2272                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2273                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2274                              ogid, ogid, gid, gid);
2275         else
2276                 r = asprintf(&gid_map,
2277                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2278                              ogid, ogid);
2279
2280         if (r < 0)
2281                 return -ENOMEM;
2282
2283         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2284          * namespace. */
2285         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2286         if (unshare_ready_fd < 0)
2287                 return -errno;
2288
2289         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2290          * failed. */
2291         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2292                 return -errno;
2293
2294         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2295         if (r < 0)
2296                 return r;
2297         if (r == 0) {
2298                 _cleanup_close_ int fd = -EBADF;
2299                 const char *a;
2300                 pid_t ppid;
2301
2302                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2303                  * here, after the parent opened its own user namespace. */
2304
2305                 ppid = getppid();
2306                 errno_pipe[0] = safe_close(errno_pipe[0]);
2307
2308                 /* Wait until the parent unshared the user namespace */
2309                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2310                         r = -errno;
2311                         goto child_fail;
2312                 }
2313
2314                 /* Disable the setgroups() system call in the child user namespace, for good. */
2315                 a = procfs_file_alloca(ppid, "setgroups");
2316                 fd = open(a, O_WRONLY|O_CLOEXEC);
2317                 if (fd < 0) {
2318                         if (errno != ENOENT) {
2319                                 r = -errno;
2320                                 goto child_fail;
2321                         }
2322
2323                         /* If the file is missing the kernel is too old, let's continue anyway. */
2324                 } else {
2325                         if (write(fd, "deny\n", 5) < 0) {
2326                                 r = -errno;
2327                                 goto child_fail;
2328                         }
2329
2330                         fd = safe_close(fd);
2331                 }
2332
2333                 /* First write the GID map */
2334                 a = procfs_file_alloca(ppid, "gid_map");
2335                 fd = open(a, O_WRONLY|O_CLOEXEC);
2336                 if (fd < 0) {
2337                         r = -errno;
2338                         goto child_fail;
2339                 }
2340                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2341                         r = -errno;
2342                         goto child_fail;
2343                 }
2344                 fd = safe_close(fd);
2345
2346                 /* The write the UID map */
2347                 a = procfs_file_alloca(ppid, "uid_map");
2348                 fd = open(a, O_WRONLY|O_CLOEXEC);
2349                 if (fd < 0) {
2350                         r = -errno;
2351                         goto child_fail;
2352                 }
2353                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2354                         r = -errno;
2355                         goto child_fail;
2356                 }
2357
2358                 _exit(EXIT_SUCCESS);
2359
2360         child_fail:
2361                 (void) write(errno_pipe[1], &r, sizeof(r));
2362                 _exit(EXIT_FAILURE);
2363         }
2364
2365         errno_pipe[1] = safe_close(errno_pipe[1]);
2366
2367         if (unshare(CLONE_NEWUSER) < 0)
2368                 return -errno;
2369
2370         /* Let the child know that the namespace is ready now */
2371         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2372                 return -errno;
2373
2374         /* Try to read an error code from the child */
2375         n = read(errno_pipe[0], &r, sizeof(r));
2376         if (n < 0)
2377                 return -errno;
2378         if (n == sizeof(r)) { /* an error code was sent to us */
2379                 if (r < 0)
2380                         return r;
2381                 return -EIO;
2382         }
2383         if (n != 0) /* on success we should have read 0 bytes */
2384                 return -EIO;
2385
2386         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2387         if (r < 0)
2388                 return r;
2389         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2390                 return -EIO;
2391
2392         return 0;
2393 }
2394
2395 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2396         assert(context);
2397
2398         if (!context->dynamic_user)
2399                 return false;
2400
2401         if (type == EXEC_DIRECTORY_CONFIGURATION)
2402                 return false;
2403
2404         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2405                 return false;
2406
2407         return true;
2408 }
2409
2410 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2411         _cleanup_free_ char *src_abs = NULL;
2412         int r;
2413
2414         assert(source);
2415
2416         src_abs = path_join(root, source);
2417         if (!src_abs)
2418                 return -ENOMEM;
2419
2420         STRV_FOREACH(dst, symlinks) {
2421                 _cleanup_free_ char *dst_abs = NULL;
2422
2423                 dst_abs = path_join(root, *dst);
2424                 if (!dst_abs)
2425                         return -ENOMEM;
2426
2427                 r = mkdir_parents_label(dst_abs, 0755);
2428                 if (r < 0)
2429                         return r;
2430
2431                 r = symlink_idempotent(src_abs, dst_abs, true);
2432                 if (r < 0)
2433                         return r;
2434         }
2435
2436         return 0;
2437 }
2438
2439 static int setup_exec_directory(
2440                 Unit *u,
2441                 const ExecContext *context,
2442                 const ExecParameters *params,
2443                 uid_t uid,
2444                 gid_t gid,
2445                 ExecDirectoryType type,
2446                 bool needs_mount_namespace,
2447                 int *exit_status) {
2448
2449         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2450                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2451                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2452                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2453                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2454                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2455         };
2456         int r;
2457
2458         assert(context);
2459         assert(params);
2460         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2461         assert(exit_status);
2462
2463         if (!params->prefix[type])
2464                 return 0;
2465
2466         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2467                 if (!uid_is_valid(uid))
2468                         uid = 0;
2469                 if (!gid_is_valid(gid))
2470                         gid = 0;
2471         }
2472
2473         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2474                 _cleanup_free_ char *p = NULL, *pp = NULL;
2475
2476                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2477                 if (!p) {
2478                         r = -ENOMEM;
2479                         goto fail;
2480                 }
2481
2482                 r = mkdir_parents_label(p, 0755);
2483                 if (r < 0)
2484                         goto fail;
2485
2486                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2487
2488                         /* If we are in user mode, and a configuration directory exists but a state directory
2489                          * doesn't exist, then we likely are upgrading from an older systemd version that
2490                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2491                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2493                          * separated. If a service has both dirs configured but only the configuration dir
2494                          * exists and the state dir does not, we assume we are looking at an update
2495                          * situation. Hence, create a compatibility symlink, so that all expectations are
2496                          * met.
2497                          *
2498                          * (We also do something similar with the log directory, which still doesn't exist in
2499                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2500
2501                         /* this assumes the state dir is always created before the configuration dir */
2502                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2503                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2504
2505                         r = laccess(p, F_OK);
2506                         if (r == -ENOENT) {
2507                                 _cleanup_free_ char *q = NULL;
2508
2509                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2510                                  * under the configuration hierarchy. */
2511
2512                                 if (type == EXEC_DIRECTORY_STATE)
2513                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2514                                 else if (type == EXEC_DIRECTORY_LOGS)
2515                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2516                                 else
2517                                         assert_not_reached();
2518                                 if (!q) {
2519                                         r = -ENOMEM;
2520                                         goto fail;
2521                                 }
2522
2523                                 r = laccess(q, F_OK);
2524                                 if (r >= 0) {
2525                                         /* It does exist! This hence looks like an update. Symlink the
2526                                          * configuration directory into the state directory. */
2527
2528                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2529                                         if (r < 0)
2530                                                 goto fail;
2531
2532                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2533                                         continue;
2534                                 } else if (r != -ENOENT)
2535                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2536
2537                         } else if (r < 0)
2538                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2539                 }
2540
2541                 if (exec_directory_is_private(context, type)) {
2542                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2543                          * case we want to avoid leaving a directory around fully accessible that is owned by
2544                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2545                          * trick used by container managers to prohibit host users to get access to files of
2546                          * the same UID in containers: we place everything inside a directory that has an
2547                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2548                          * for unprivileged host code. We then use fs namespacing to make this directory
2549                          * permeable for the service itself.
2550                          *
2551                          * Specifically: for a service which wants a special directory "foo/" we first create
2552                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2553                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2554                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2555                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2556                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2557                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2558                          * for the service and making sure it only gets access to the dirs it needs but no
2559                          * others. Tricky? Yes, absolutely, but it works!
2560                          *
2561                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2562                          * to be owned by the service itself.
2563                          *
2564                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2565                          * for sharing files or sockets with other services. */
2566
2567                         pp = path_join(params->prefix[type], "private");
2568                         if (!pp) {
2569                                 r = -ENOMEM;
2570                                 goto fail;
2571                         }
2572
2573                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2574                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2575                         if (r < 0)
2576                                 goto fail;
2577
2578                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2579                                 r = -ENOMEM;
2580                                 goto fail;
2581                         }
2582
2583                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2584                         r = mkdir_parents_label(pp, 0755);
2585                         if (r < 0)
2586                                 goto fail;
2587
2588                         if (is_dir(p, false) > 0 &&
2589                             (laccess(pp, F_OK) == -ENOENT)) {
2590
2591                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2592                                  * it over. Most likely the service has been upgraded from one that didn't use
2593                                  * DynamicUser=1, to one that does. */
2594
2595                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2596                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2597                                               exec_directory_type_to_string(type), p, pp);
2598
2599                                 r = RET_NERRNO(rename(p, pp));
2600                                 if (r < 0)
2601                                         goto fail;
2602                         } else {
2603                                 /* Otherwise, create the actual directory for the service */
2604
2605                                 r = mkdir_label(pp, context->directories[type].mode);
2606                                 if (r < 0 && r != -EEXIST)
2607                                         goto fail;
2608                         }
2609
2610                         if (!context->directories[type].items[i].only_create) {
2611                                 /* And link it up from the original place.
2612                                  * Notes
2613                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2614                                  *    the host, and a new one for the child namespace will be created later.
2615                                  * 2) It is not necessary to create this symlink when one of its parent
2616                                  *    directories is specified and already created. E.g.
2617                                  *        StateDirectory=foo foo/bar
2618                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2619                                  *        pp = "/var/lib/private/foo/bar"
2620                                  *        p = "/var/lib/foo/bar"
2621                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2622                                  *    we do not need to create the symlink, but we cannot create the symlink.
2623                                  *    See issue #24783. */
2624                                 r = symlink_idempotent(pp, p, true);
2625                                 if (r < 0)
2626                                         goto fail;
2627                         }
2628
2629                 } else {
2630                         _cleanup_free_ char *target = NULL;
2631
2632                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2633                             readlink_and_make_absolute(p, &target) >= 0) {
2634                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2635
2636                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2637                                  * by DynamicUser=1 (see above)?
2638                                  *
2639                                  * We do this for all directory types except for ConfigurationDirectory=,
2640                                  * since they all support the private/ symlink logic at least in some
2641                                  * configurations, see above. */
2642
2643                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2644                                 if (r < 0)
2645                                         goto fail;
2646
2647                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2648                                 if (!q) {
2649                                         r = -ENOMEM;
2650                                         goto fail;
2651                                 }
2652
2653                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2654                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2655                                 if (r < 0)
2656                                         goto fail;
2657
2658                                 if (path_equal(q_resolved, target_resolved)) {
2659
2660                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2661                                          * but is no longer. Let's move the directory back up. */
2662
2663                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2664                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2665                                                       exec_directory_type_to_string(type), q, p);
2666
2667                                         r = RET_NERRNO(unlink(p));
2668                                         if (r < 0)
2669                                                 goto fail;
2670
2671                                         r = RET_NERRNO(rename(q, p));
2672                                         if (r < 0)
2673                                                 goto fail;
2674                                 }
2675                         }
2676
2677                         r = mkdir_label(p, context->directories[type].mode);
2678                         if (r < 0) {
2679                                 if (r != -EEXIST)
2680                                         goto fail;
2681
2682                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2683                                         struct stat st;
2684
2685                                         /* Don't change the owner/access mode of the configuration directory,
2686                                          * as in the common case it is not written to by a service, and shall
2687                                          * not be writable. */
2688
2689                                         r = RET_NERRNO(stat(p, &st));
2690                                         if (r < 0)
2691                                                 goto fail;
2692
2693                                         /* Still complain if the access mode doesn't match */
2694                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2695                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2696                                                                  "(File system: %o %sMode: %o)",
2697                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2698                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2699
2700                                         continue;
2701                                 }
2702                         }
2703                 }
2704
2705                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2706                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2707                  * current UID/GID ownership.) */
2708                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2709                 if (r < 0)
2710                         goto fail;
2711
2712                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2713                  * available to user code anyway */
2714                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2715                         continue;
2716
2717                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2718                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2719                  * assignments to exist. */
2720                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2721                 if (r < 0)
2722                         goto fail;
2723         }
2724
2725         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2726          * they are set up later, to allow configuring empty var/run/etc. */
2727         if (!needs_mount_namespace)
2728                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2729                         r = create_many_symlinks(params->prefix[type],
2730                                                  context->directories[type].items[i].path,
2731                                                  context->directories[type].items[i].symlinks);
2732                         if (r < 0)
2733                                 goto fail;
2734                 }
2735
2736         return 0;
2737
2738 fail:
2739         *exit_status = exit_status_table[type];
2740         return r;
2741 }
2742
2743 static int write_credential(
2744                 int dfd,
2745                 const char *id,
2746                 const void *data,
2747                 size_t size,
2748                 uid_t uid,
2749                 gid_t gid,
2750                 bool ownership_ok) {
2751
2752         _cleanup_(unlink_and_freep) char *tmp = NULL;
2753         _cleanup_close_ int fd = -EBADF;
2754         int r;
2755
2756         r = tempfn_random_child("", "cred", &tmp);
2757         if (r < 0)
2758                 return r;
2759
2760         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2761         if (fd < 0) {
2762                 tmp = mfree(tmp);
2763                 return -errno;
2764         }
2765
2766         r = loop_write(fd, data, size, /* do_poll = */ false);
2767         if (r < 0)
2768                 return r;
2769
2770         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2771                 return -errno;
2772
2773         if (uid_is_valid(uid) && uid != getuid()) {
2774                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2775                 if (r < 0) {
2776                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2777                                 return r;
2778
2779                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2780                                             * to express: that the user gets read access and nothing
2781                                             * else. But if the backing fs can't support that (e.g. ramfs)
2782                                             * then we can use file ownership instead. But that's only safe if
2783                                             * we can then re-mount the whole thing read-only, so that the
2784                                             * user can no longer chmod() the file to gain write access. */
2785                                 return r;
2786
2787                         if (fchown(fd, uid, gid) < 0)
2788                                 return -errno;
2789                 }
2790         }
2791
2792         if (renameat(dfd, tmp, dfd, id) < 0)
2793                 return -errno;
2794
2795         tmp = mfree(tmp);
2796         return 0;
2797 }
2798
/* Selects which set of credential store directories credential_search_path() assembles. */
typedef enum CredentialSearchPath {
        /* Plaintext stores: the received credentials directory (if set) plus the "credstore" dirs */
        CREDENTIAL_SEARCH_PATH_TRUSTED,
        /* Encrypted stores: the received encrypted credentials directory plus the "credstore.encrypted" dirs */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED,
        /* Both of the above, with the encrypted stores listed first */
        CREDENTIAL_SEARCH_PATH_ALL,
        /* Number of valid values, used as exclusive upper bound in range checks */
        _CREDENTIAL_SEARCH_PATH_MAX,
        /* Marker for an invalid value */
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2806
2807 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2808
2809         _cleanup_strv_free_ char **l = NULL;
2810
2811         assert(params);
2812         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2813
2814         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2815          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2816          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2817
2818         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2819                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2820                         return NULL;
2821
2822                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2823                         return NULL;
2824         }
2825
2826         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2827                 if (params->received_credentials_directory)
2828                         if (strv_extend(&l, params->received_credentials_directory) < 0)
2829                                 return NULL;
2830
2831                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2832                         return NULL;
2833         }
2834
2835         if (DEBUG_LOGGING) {
2836                 _cleanup_free_ char *t = strv_join(l, ":");
2837
2838                 log_debug("Credential search path is: %s", strempty(t));
2839         }
2840
2841         return TAKE_PTR(l);
2842 }
2843
2844 static int maybe_decrypt_and_write_credential(
2845                 int dir_fd,
2846                 const char *id,
2847                 bool encrypted,
2848                 uid_t uid,
2849                 gid_t gid,
2850                 bool ownership_ok,
2851                 const char *data,
2852                 size_t size,
2853                 uint64_t *left) {
2854
2855         _cleanup_free_ void *plaintext = NULL;
2856         size_t add;
2857         int r;
2858
2859         if (encrypted) {
2860                 size_t plaintext_size = 0;
2861
2862                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2863                                                 &plaintext, &plaintext_size);
2864                 if (r < 0)
2865                         return r;
2866
2867                 data = plaintext;
2868                 size = plaintext_size;
2869         }
2870
2871         add = strlen(id) + size;
2872         if (add > *left)
2873                 return -E2BIG;
2874
2875         r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok);
2876         if (r < 0)
2877                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2878
2879         *left -= add;
2880         return 0;
2881 }
2882
2883 static int load_credential_glob(
2884                 const char *path,
2885                 bool encrypted,
2886                 char **search_path,
2887                 ReadFullFileFlags flags,
2888                 int write_dfd,
2889                 uid_t uid,
2890                 gid_t gid,
2891                 bool ownership_ok,
2892                 uint64_t *left) {
2893
2894         int r;
2895
2896         STRV_FOREACH(d, search_path) {
2897                 _cleanup_globfree_ glob_t pglob = {};
2898                 _cleanup_free_ char *j = NULL;
2899
2900                 j = path_join(*d, path);
2901                 if (!j)
2902                         return -ENOMEM;
2903
2904                 r = safe_glob(j, 0, &pglob);
2905                 if (r == -ENOENT)
2906                         continue;
2907                 if (r < 0)
2908                         return r;
2909
2910                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2911                         _cleanup_free_ char *fn = NULL;
2912                         _cleanup_(erase_and_freep) char *data = NULL;
2913                         size_t size;
2914
2915                         /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2916                         r = read_full_file_full(
2917                                 AT_FDCWD,
2918                                 pglob.gl_pathv[n],
2919                                 UINT64_MAX,
2920                                 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2921                                 flags,
2922                                 NULL,
2923                                 &data, &size);
2924                         if (r < 0)
2925                                 return log_debug_errno(r, "Failed to read credential '%s': %m",
2926                                                         pglob.gl_pathv[n]);
2927
2928                         r = path_extract_filename(pglob.gl_pathv[n], &fn);
2929                         if (r < 0)
2930                                 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2931                                                         pglob.gl_pathv[n]);
2932
2933                         r = maybe_decrypt_and_write_credential(
2934                                 write_dfd,
2935                                 fn,
2936                                 encrypted,
2937                                 uid,
2938                                 gid,
2939                                 ownership_ok,
2940                                 data, size,
2941                                 left);
2942                         if (r == -EEXIST)
2943                                 continue;
2944                         if (r < 0)
2945                                 return r;
2946                 }
2947         }
2948
2949         return 0;
2950 }
2951
2952 static int load_credential(
2953                 const ExecContext *context,
2954                 const ExecParameters *params,
2955                 const char *id,
2956                 const char *path,
2957                 bool encrypted,
2958                 const char *unit,
2959                 int read_dfd,
2960                 int write_dfd,
2961                 uid_t uid,
2962                 gid_t gid,
2963                 bool ownership_ok,
2964                 uint64_t *left) {
2965
2966         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2967         _cleanup_strv_free_ char **search_path = NULL;
2968         _cleanup_(erase_and_freep) char *data = NULL;
2969         _cleanup_free_ char *bindname = NULL;
2970         const char *source = NULL;
2971         bool missing_ok = true;
2972         size_t size, maxsz;
2973         int r;
2974
2975         assert(context);
2976         assert(params);
2977         assert(id);
2978         assert(path);
2979         assert(unit);
2980         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2981         assert(write_dfd >= 0);
2982         assert(left);
2983
2984         if (read_dfd >= 0) {
2985                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2986                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2987                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2988                  * open it. */
2989
2990                 if (!filename_is_valid(path)) /* safety check */
2991                         return -EINVAL;
2992
2993                 missing_ok = true;
2994                 source = path;
2995
2996         } else if (path_is_absolute(path)) {
2997                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2998                  * sockets */
2999
3000                 if (!path_is_valid(path)) /* safety check */
3001                         return -EINVAL;
3002
3003                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
3004
3005                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3006                  * via the source socket address in case we read off an AF_UNIX socket. */
3007                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3008                         return -ENOMEM;
3009
3010                 missing_ok = false;
3011                 source = path;
3012
3013         } else if (credential_name_valid(path)) {
3014                 /* If this is a relative path, take it as credential name relative to the credentials
3015                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3016                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3017
3018                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3019                 if (!search_path)
3020                         return -ENOMEM;
3021
3022                 missing_ok = true;
3023         } else
3024                 source = NULL;
3025
3026         if (encrypted)
3027                 flags |= READ_FULL_FILE_UNBASE64;
3028
3029         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3030
3031         if (search_path) {
3032                 STRV_FOREACH(d, search_path) {
3033                         _cleanup_free_ char *j = NULL;
3034
3035                         j = path_join(*d, path);
3036                         if (!j)
3037                                 return -ENOMEM;
3038
3039                         r = read_full_file_full(
3040                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3041                                         UINT64_MAX,
3042                                         maxsz,
3043                                         flags,
3044                                         NULL,
3045                                         &data, &size);
3046                         if (r != -ENOENT)
3047                                 break;
3048                 }
3049         } else if (source)
3050                 r = read_full_file_full(
3051                                 read_dfd, source,
3052                                 UINT64_MAX,
3053                                 maxsz,
3054                                 flags,
3055                                 bindname,
3056                                 &data, &size);
3057         else
3058                 r = -ENOENT;
3059
3060         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3061                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3062                  * will get clear errors if we don't pass such a missing credential on as they
3063                  * themselves will get ENOENT when trying to read them, which should not be much
3064                  * worse than when we handle the error here and make it fatal.
3065                  *
3066                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3067                  * we are fine, too. */
3068                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3069                 return 0;
3070         }
3071         if (r < 0)
3072                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3073
3074         return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left);
3075 }
3076
/* Arguments handed to load_cred_recurse_dir_cb() via its userdata pointer; the callback forwards them
 * to load_credential() for every suitable directory entry. */
struct load_cred_args {
        const ExecContext *context;
        const ExecParameters *params;
        bool encrypted;         /* whether the files found are to be treated as encrypted credentials */
        const char *unit;
        int dfd;                /* directory fd the credentials are written into, also used to detect duplicate IDs */
        uid_t uid;
        gid_t gid;
        bool ownership_ok;
        uint64_t *left;         /* remaining size budget, updated as credentials are written */
};
3088
3089 static int load_cred_recurse_dir_cb(
3090                 RecurseDirEvent event,
3091                 const char *path,
3092                 int dir_fd,
3093                 int inode_fd,
3094                 const struct dirent *de,
3095                 const struct statx *sx,
3096                 void *userdata) {
3097
3098         struct load_cred_args *args = ASSERT_PTR(userdata);
3099         _cleanup_free_ char *sub_id = NULL;
3100         int r;
3101
3102         if (event != RECURSE_DIR_ENTRY)
3103                 return RECURSE_DIR_CONTINUE;
3104
3105         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3106                 return RECURSE_DIR_CONTINUE;
3107
3108         sub_id = strreplace(path, "/", "_");
3109         if (!sub_id)
3110                 return -ENOMEM;
3111
3112         if (!credential_name_valid(sub_id))
3113                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3114
3115         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3116                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3117                 return RECURSE_DIR_CONTINUE;
3118         }
3119         if (errno != ENOENT)
3120                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3121
3122         r = load_credential(
3123                         args->context,
3124                         args->params,
3125                         sub_id,
3126                         de->d_name,
3127                         args->encrypted,
3128                         args->unit,
3129                         dir_fd,
3130                         args->dfd,
3131                         args->uid,
3132                         args->gid,
3133                         args->ownership_ok,
3134                         args->left);
3135         if (r < 0)
3136                 return r;
3137
3138         return RECURSE_DIR_CONTINUE;
3139 }
3140
3141 static int acquire_credentials(
3142                 const ExecContext *context,
3143                 const ExecParameters *params,
3144                 const char *unit,
3145                 const char *p,
3146                 uid_t uid,
3147                 gid_t gid,
3148                 bool ownership_ok) {
3149
3150         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3151         _cleanup_close_ int dfd = -EBADF;
3152         const char *ic;
3153         ExecLoadCredential *lc;
3154         ExecSetCredential *sc;
3155         int r;
3156
3157         assert(context);
3158         assert(p);
3159
3160         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3161         if (dfd < 0)
3162                 return -errno;
3163
3164         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3165         if (r < 0)
3166                 return r;
3167
3168         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3169         HASHMAP_FOREACH(lc, context->load_credentials) {
3170                 _cleanup_close_ int sub_fd = -EBADF;
3171
3172                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3173                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3174                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
3175                  * propagate a credential passed to us from further up. */
3176
3177                 if (path_is_absolute(lc->path)) {
3178                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3179                         if (sub_fd < 0 && !IN_SET(errno,
3180                                                   ENOTDIR,  /* Not a directory */
3181                                                   ENOENT))  /* Doesn't exist? */
3182                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3183                 }
3184
3185                 if (sub_fd < 0)
3186                         /* Regular file (incl. a credential passed in from higher up) */
3187                         r = load_credential(
3188                                         context,
3189                                         params,
3190                                         lc->id,
3191                                         lc->path,
3192                                         lc->encrypted,
3193                                         unit,
3194                                         AT_FDCWD,
3195                                         dfd,
3196                                         uid,
3197                                         gid,
3198                                         ownership_ok,
3199                                         &left);
3200                 else
3201                         /* Directory */
3202                         r = recurse_dir(
3203                                         sub_fd,
3204                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3205                                         /* statx_mask= */ 0,
3206                                         /* n_depth_max= */ UINT_MAX,
3207                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3208                                         load_cred_recurse_dir_cb,
3209                                         &(struct load_cred_args) {
3210                                                 .context = context,
3211                                                 .params = params,
3212                                                 .encrypted = lc->encrypted,
3213                                                 .unit = unit,
3214                                                 .dfd = dfd,
3215                                                 .uid = uid,
3216                                                 .gid = gid,
3217                                                 .ownership_ok = ownership_ok,
3218                                                 .left = &left,
3219                                         });
3220                 if (r < 0)
3221                         return r;
3222         }
3223
3224         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3225          * override any credentials found earlier. */
3226         SET_FOREACH(ic, context->import_credentials) {
3227                 _cleanup_free_ char **search_path = NULL;
3228
3229                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3230                 if (!search_path)
3231                         return -ENOMEM;
3232
3233                 r = load_credential_glob(
3234                                 ic,
3235                                 /* encrypted = */ false,
3236                                 search_path,
3237                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3238                                 dfd,
3239                                 uid,
3240                                 gid,
3241                                 ownership_ok,
3242                                 &left);
3243                 if (r < 0)
3244                         return r;
3245
3246                 search_path = strv_free(search_path);
3247                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3248                 if (!search_path)
3249                         return -ENOMEM;
3250
3251                 r = load_credential_glob(
3252                                 ic,
3253                                 /* encrypted = */ true,
3254                                 search_path,
3255                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3256                                 dfd,
3257                                 uid,
3258                                 gid,
3259                                 ownership_ok,
3260                                 &left);
3261                 if (r < 0)
3262                         return r;
3263         }
3264
3265         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3266          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3267         HASHMAP_FOREACH(sc, context->set_credentials) {
3268                 _cleanup_(erase_and_freep) void *plaintext = NULL;
3269                 const char *data;
3270                 size_t size, add;
3271
3272                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3273                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3274                  * slow and involved, hence it's nice to be able to skip that if the credential already
3275                  * exists anyway. */
3276                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3277                         continue;
3278                 if (errno != ENOENT)
3279                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3280
3281                 if (sc->encrypted) {
3282                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3283                         if (r < 0)
3284                                 return r;
3285
3286                         data = plaintext;
3287                 } else {
3288                         data = sc->data;
3289                         size = sc->size;
3290                 }
3291
3292                 add = strlen(sc->id) + size;
3293                 if (add > left)
3294                         return -E2BIG;
3295
3296                 r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok);
3297                 if (r < 0)
3298                         return r;
3299
3300                 left -= add;
3301         }
3302
3303         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3304         if (r < 0)
3305                 return r;
3306
3307         /* After we created all keys with the right perms, also make sure the credential store as a whole is
3308          * accessible */
3309
3310         if (uid_is_valid(uid) && uid != getuid()) {
3311                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3312                 if (r < 0) {
3313                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3314                                 return r;
3315
3316                         if (!ownership_ok)
3317                                 return r;
3318
3319                         if (fchown(dfd, uid, gid) < 0)
3320                                 return -errno;
3321                 }
3322         }
3323
3324         return 0;
3325 }
3326
3327 static int setup_credentials_internal(
3328                 const ExecContext *context,
3329                 const ExecParameters *params,
3330                 const char *unit,
3331                 const char *final,        /* This is where the credential store shall eventually end up at */
3332                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
3333                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
3334                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3335                 uid_t uid,
3336                 gid_t gid) {
3337
3338         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3339                                    * if we mounted something; false if we definitely can't mount anything */
3340         bool final_mounted;
3341         const char *where;
3342
3343         assert(context);
3344         assert(final);
3345         assert(workspace);
3346
3347         if (reuse_workspace) {
3348                 r = path_is_mount_point(workspace, NULL, 0);
3349                 if (r < 0)
3350                         return r;
3351                 if (r > 0)
3352                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3353                 else
3354                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3355         } else
3356                 workspace_mounted = -1; /* ditto */
3357
3358         r = path_is_mount_point(final, NULL, 0);
3359         if (r < 0)
3360                 return r;
3361         if (r > 0) {
3362                 /* If the final place already has something mounted, we use that. If the workspace also has
3363                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3364                  * different). */
3365                 final_mounted = true;
3366
3367                 if (workspace_mounted < 0) {
3368                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
3369                          * the final version to the workspace, and make it writable, so that we can make
3370                          * changes */
3371
3372                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3373                         if (r < 0)
3374                                 return r;
3375
3376                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3377                         if (r < 0)
3378                                 return r;
3379
3380                         workspace_mounted = true;
3381                 }
3382         } else
3383                 final_mounted = false;
3384
3385         if (workspace_mounted < 0) {
3386                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3387
3388                 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3389                 if (r < 0) {
3390                         /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3391                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3392                         if (r < 0) {
3393                                 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3394                                         return r;
3395
3396                                 if (must_mount) /* If we it's not OK to use the plain directory
3397                                                  * fallback, propagate all errors too */
3398                                         return r;
3399
3400                                 /* If we lack privileges to bind mount stuff, then let's gracefully
3401                                  * proceed for compat with container envs, and just use the final dir
3402                                  * as is. */
3403
3404                                 workspace_mounted = false;
3405                         } else {
3406                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3407                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3408                                 if (r < 0)
3409                                         return r;
3410
3411                                 workspace_mounted = true;
3412                         }
3413                 } else
3414                         workspace_mounted = true;
3415         }
3416
3417         assert(!must_mount || workspace_mounted > 0);
3418         where = workspace_mounted ? workspace : final;
3419
3420         (void) label_fix_full(AT_FDCWD, where, final, 0);
3421
3422         r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
3423         if (r < 0)
3424                 return r;
3425
3426         if (workspace_mounted) {
3427                 bool install;
3428
3429                 /* Determine if we should actually install the prepared mount in the final location by bind
3430                  * mounting it there. We do so only if the mount is not established there already, and if the
3431                  * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3432                  * case we are doing all this in a mount namespace, thus no one else will see that we
3433                  * allocated a file system we are getting rid of again here. */
3434                 if (final_mounted)
3435                         install = false; /* already installed */
3436                 else {
3437                         r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3438                         if (r < 0)
3439                                 return r;
3440
3441                         install = r == 0; /* install only if non-empty */
3442                 }
3443
3444                 if (install) {
3445                         /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3446                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3447                         if (r < 0)
3448                                 return r;
3449
3450                         /* And mount it to the final place, read-only */
3451                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3452                 } else
3453                         /* Otherwise get rid of it */
3454                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3455                 if (r < 0)
3456                         return r;
3457         } else {
3458                 _cleanup_free_ char *parent = NULL;
3459
3460                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3461                  * open access to the top-level credential directory and the per-service directory now */
3462
3463                 r = path_extract_directory(final, &parent);
3464                 if (r < 0)
3465                         return r;
3466                 if (chmod(parent, 0755) < 0)
3467                         return -errno;
3468         }
3469
3470         return 0;
3471 }
3472
3473 static int setup_credentials(
3474                 const ExecContext *context,
3475                 const ExecParameters *params,
3476                 const char *unit,
3477                 uid_t uid,
3478                 gid_t gid) {
3479
3480         _cleanup_free_ char *p = NULL, *q = NULL;
3481         int r;
3482
3483         assert(context);
3484         assert(params);
3485
3486         if (!exec_context_has_credentials(context))
3487                 return 0;
3488
3489         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3490                 return -EINVAL;
3491
3492         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3493          * and the subdir we mount over with a read-only file system readable by the service's user */
3494         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3495         if (!q)
3496                 return -ENOMEM;
3497
3498         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3499         if (r < 0 && r != -EEXIST)
3500                 return r;
3501
3502         p = path_join(q, unit);
3503         if (!p)
3504                 return -ENOMEM;
3505
3506         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3507         if (r < 0 && r != -EEXIST)
3508                 return r;
3509
3510         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3511         if (r < 0) {
3512                 _cleanup_free_ char *t = NULL, *u = NULL;
3513
3514                 /* If this is not a privilege or support issue then propagate the error */
3515                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3516                         return r;
3517
3518                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3519                  * it into place, so that users can't access half-initialized credential stores. */
3520                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3521                 if (!t)
3522                         return -ENOMEM;
3523
3524                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3525                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3526                  * after it is fully set up */
3527                 u = path_join(t, unit);
3528                 if (!u)
3529                         return -ENOMEM;
3530
3531                 FOREACH_STRING(i, t, u) {
3532                         r = mkdir_label(i, 0700);
3533                         if (r < 0 && r != -EEXIST)
3534                                 return r;
3535                 }
3536
3537                 r = setup_credentials_internal(
3538                                 context,
3539                                 params,
3540                                 unit,
3541                                 p,       /* final mount point */
3542                                 u,       /* temporary workspace to overmount */
3543                                 true,    /* reuse the workspace if it is already a mount */
3544                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3545                                 uid,
3546                                 gid);
3547
3548                 (void) rmdir(u); /* remove the workspace again if we can. */
3549
3550                 if (r < 0)
3551                         return r;
3552
3553         } else if (r == 0) {
3554
3555                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3556                  * we can use the same directory for all cases, after turning off propagation. Question
3557                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3558                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3559                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3560                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3561                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3562                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3563                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3564                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3565                  * propagation on the former, and then overmount the latter.
3566                  *
3567                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3568                  * for this purpose, but there are few other candidates that work equally well for us, and
3569                  * given that the we do this in a privately namespaced short-lived single-threaded process
3570                  * that no one else sees this should be OK to do. */
3571
3572                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3573                 if (r < 0)
3574                         goto child_fail;
3575
3576                 r = setup_credentials_internal(
3577                                 context,
3578                                 params,
3579                                 unit,
3580                                 p,           /* final mount point */
3581                                 "/dev/shm",  /* temporary workspace to overmount */
3582                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3583                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3584                                 uid,
3585                                 gid);
3586                 if (r < 0)
3587                         goto child_fail;
3588
3589                 _exit(EXIT_SUCCESS);
3590
3591         child_fail:
3592                 _exit(EXIT_FAILURE);
3593         }
3594
3595         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3596          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3597          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3598          * seen by users when trying access this inode. */
3599         (void) rmdir(p);
3600         return 0;
3601 }
3602
3603 #if ENABLE_SMACK
3604 static int setup_smack(
3605                 const Manager *manager,
3606                 const ExecContext *context,
3607                 int executable_fd) {
3608         int r;
3609
3610         assert(context);
3611         assert(executable_fd >= 0);
3612
3613         if (context->smack_process_label) {
3614                 r = mac_smack_apply_pid(0, context->smack_process_label);
3615                 if (r < 0)
3616                         return r;
3617         } else if (manager->default_smack_process_label) {
3618                 _cleanup_free_ char *exec_label = NULL;
3619
3620                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3621                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3622                         return r;
3623
3624                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3625                 if (r < 0)
3626                         return r;
3627         }
3628
3629         return 0;
3630 }
3631 #endif
3632
3633 static int compile_bind_mounts(
3634                 const ExecContext *context,
3635                 const ExecParameters *params,
3636                 BindMount **ret_bind_mounts,
3637                 size_t *ret_n_bind_mounts,
3638                 char ***ret_empty_directories) {
3639
3640         _cleanup_strv_free_ char **empty_directories = NULL;
3641         BindMount *bind_mounts = NULL;
3642         size_t n, h = 0;
3643         int r;
3644
3645         assert(context);
3646         assert(params);
3647         assert(ret_bind_mounts);
3648         assert(ret_n_bind_mounts);
3649         assert(ret_empty_directories);
3650
3651         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3652
3653         n = context->n_bind_mounts;
3654         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3655                 if (!params->prefix[t])
3656                         continue;
3657
3658                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3659                         n += !context->directories[t].items[i].only_create;
3660         }
3661
3662         if (n <= 0) {
3663                 *ret_bind_mounts = NULL;
3664                 *ret_n_bind_mounts = 0;
3665                 *ret_empty_directories = NULL;
3666                 return 0;
3667         }
3668
3669         bind_mounts = new(BindMount, n);
3670         if (!bind_mounts)
3671                 return -ENOMEM;
3672
3673         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3674                 BindMount *item = context->bind_mounts + i;
3675                 _cleanup_free_ char *s = NULL, *d = NULL;
3676
3677                 s = strdup(item->source);
3678                 if (!s)
3679                         return -ENOMEM;
3680
3681                 d = strdup(item->destination);
3682                 if (!d)
3683                         return -ENOMEM;
3684
3685                 bind_mounts[h++] = (BindMount) {
3686                         .source = TAKE_PTR(s),
3687                         .destination = TAKE_PTR(d),
3688                         .read_only = item->read_only,
3689                         .recursive = item->recursive,
3690                         .ignore_enoent = item->ignore_enoent,
3691                 };
3692         }
3693
3694         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3695                 if (!params->prefix[t])
3696                         continue;
3697
3698                 if (context->directories[t].n_items == 0)
3699                         continue;
3700
3701                 if (exec_directory_is_private(context, t) &&
3702                     !exec_context_with_rootfs(context)) {
3703                         char *private_root;
3704
3705                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3706                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3707                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3708
3709                         private_root = path_join(params->prefix[t], "private");
3710                         if (!private_root)
3711                                 return -ENOMEM;
3712
3713                         r = strv_consume(&empty_directories, private_root);
3714                         if (r < 0)
3715                                 return r;
3716                 }
3717
3718                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3719                         _cleanup_free_ char *s = NULL, *d = NULL;
3720
3721                         /* When one of the parent directories is in the list, we cannot create the symlink
3722                          * for the child directory. See also the comments in setup_exec_directory(). */
3723                         if (context->directories[t].items[i].only_create)
3724                                 continue;
3725
3726                         if (exec_directory_is_private(context, t))
3727                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3728                         else
3729                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3730                         if (!s)
3731                                 return -ENOMEM;
3732
3733                         if (exec_directory_is_private(context, t) &&
3734                             exec_context_with_rootfs(context))
3735                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3736                                  * directory is not created on the root directory. So, let's bind-mount the directory
3737                                  * on the 'non-private' place. */
3738                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3739                         else
3740                                 d = strdup(s);
3741                         if (!d)
3742                                 return -ENOMEM;
3743
3744                         bind_mounts[h++] = (BindMount) {
3745                                 .source = TAKE_PTR(s),
3746                                 .destination = TAKE_PTR(d),
3747                                 .read_only = false,
3748                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3749                                 .recursive = true,
3750                                 .ignore_enoent = false,
3751                         };
3752                 }
3753         }
3754
3755         assert(h == n);
3756
3757         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3758         *ret_n_bind_mounts = n;
3759         *ret_empty_directories = TAKE_PTR(empty_directories);
3760
3761         return (int) n;
3762 }
3763
3764 /* ret_symlinks will contain a list of pairs src:dest that describes
3765  * the symlinks to create later on. For example, the symlinks needed
3766  * to safely give private directories to DynamicUser=1 users. */
3767 static int compile_symlinks(
3768                 const ExecContext *context,
3769                 const ExecParameters *params,
3770                 bool setup_os_release_symlink,
3771                 char ***ret_symlinks) {
3772
3773         _cleanup_strv_free_ char **symlinks = NULL;
3774         int r;
3775
3776         assert(context);
3777         assert(params);
3778         assert(ret_symlinks);
3779
3780         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3781                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3782                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3783
3784                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3785                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3786
3787                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3788                                 dst_abs = path_join(params->prefix[dt], *symlink);
3789                                 if (!src_abs || !dst_abs)
3790                                         return -ENOMEM;
3791
3792                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3793                                 if (r < 0)
3794                                         return r;
3795                         }
3796
3797                         if (!exec_directory_is_private(context, dt) ||
3798                             exec_context_with_rootfs(context) ||
3799                             context->directories[dt].items[i].only_create)
3800                                 continue;
3801
3802                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3803                         if (!private_path)
3804                                 return -ENOMEM;
3805
3806                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3807                         if (!path)
3808                                 return -ENOMEM;
3809
3810                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3811                         if (r < 0)
3812                                 return r;
3813                 }
3814         }
3815
3816         /* We make the host's os-release available via a symlink, so that we can copy it atomically
3817          * and readers will never get a half-written version. Note that, while the paths specified here are
3818          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3819          * 'os-release -> .os-release-stage/os-release' is what will be created. */
3820         if (setup_os_release_symlink) {
3821                 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
3822                 if (r < 0)
3823                         return r;
3824
3825                 r = strv_extend(&symlinks, "/run/host/os-release");
3826                 if (r < 0)
3827                         return r;
3828         }
3829
3830         *ret_symlinks = TAKE_PTR(symlinks);
3831
3832         return 0;
3833 }
3834
3835 static bool insist_on_sandboxing(
3836                 const ExecContext *context,
3837                 const char *root_dir,
3838                 const char *root_image,
3839                 const BindMount *bind_mounts,
3840                 size_t n_bind_mounts) {
3841
3842         assert(context);
3843         assert(n_bind_mounts == 0 || bind_mounts);
3844
3845         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3846          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3847          * rearrange stuff in a way we cannot ignore gracefully. */
3848
3849         if (context->n_temporary_filesystems > 0)
3850                 return true;
3851
3852         if (root_dir || root_image)
3853                 return true;
3854
3855         if (context->n_mount_images > 0)
3856                 return true;
3857
3858         if (context->dynamic_user)
3859                 return true;
3860
3861         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3862                 return true;
3863
3864         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3865          * essential. */
3866         for (size_t i = 0; i < n_bind_mounts; i++)
3867                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3868                         return true;
3869
3870         if (context->log_namespace)
3871                 return true;
3872
3873         return false;
3874 }
3875
3876 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3877         _cleanup_close_ int fd = -EBADF;
3878         int r;
3879
3880         if (!runtime || !runtime->ephemeral_copy)
3881                 return 0;
3882
3883         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3884         if (r < 0)
3885                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3886
3887         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3888
3889         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3890         if (fd >= 0)
3891                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3892                 return 0;
3893
3894         if (fd != -EAGAIN)
3895                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3896
3897         log_debug("Making ephemeral snapshot of %s to %s",
3898                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3899
3900         if (context->root_image)
3901                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3902                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3903         else
3904                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3905                                               AT_FDCWD, runtime->ephemeral_copy,
3906                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3907                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3908                                               BTRFS_SNAPSHOT_RECURSIVE |
3909                                               BTRFS_SNAPSHOT_LOCK_BSD);
3910         if (fd < 0)
3911                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3912                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3913
3914         if (context->root_image) {
3915                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3916                  * which tends to not perform well in combination with lots of random writes.
3917                  *
3918                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3919                  * copy, but we at least want to make the intention clear.
3920                  */
3921                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3922                 if (r < 0)
3923                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3924         }
3925
3926         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3927         if (r < 0)
3928                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3929
3930         return 1;
3931 }
3932
3933 static int verity_settings_prepare(
3934                 VeritySettings *verity,
3935                 const char *root_image,
3936                 const void *root_hash,
3937                 size_t root_hash_size,
3938                 const char *root_hash_path,
3939                 const void *root_hash_sig,
3940                 size_t root_hash_sig_size,
3941                 const char *root_hash_sig_path,
3942                 const char *verity_data_path) {
3943
3944         int r;
3945
3946         assert(verity);
3947
3948         if (root_hash) {
3949                 void *d;
3950
3951                 d = memdup(root_hash, root_hash_size);
3952                 if (!d)
3953                         return -ENOMEM;
3954
3955                 free_and_replace(verity->root_hash, d);
3956                 verity->root_hash_size = root_hash_size;
3957                 verity->designator = PARTITION_ROOT;
3958         }
3959
3960         if (root_hash_sig) {
3961                 void *d;
3962
3963                 d = memdup(root_hash_sig, root_hash_sig_size);
3964                 if (!d)
3965                         return -ENOMEM;
3966
3967                 free_and_replace(verity->root_hash_sig, d);
3968                 verity->root_hash_sig_size = root_hash_sig_size;
3969                 verity->designator = PARTITION_ROOT;
3970         }
3971
3972         if (verity_data_path) {
3973                 r = free_and_strdup(&verity->data_path, verity_data_path);
3974                 if (r < 0)
3975                         return r;
3976         }
3977
3978         r = verity_settings_load(
3979                         verity,
3980                         root_image,
3981                         root_hash_path,
3982                         root_hash_sig_path);
3983         if (r < 0)
3984                 return log_debug_errno(r, "Failed to load root hash: %m");
3985
3986         return 0;
3987 }
3988
/* Collects everything needed for the unit's mount namespace and hands it to setup_namespace().
 *
 * From 'context', 'params' and 'runtime' this derives: the root directory or root image (preferring the
 * runtime's ephemeral copy when there is one), the bind mounts and empty directories, the list of
 * read-write paths (extended by 'memory_pressure_path' where needed), the NamespaceInfo flags, the
 * symlinks to create, the private /tmp and /var/tmp directories, the credentials path, the
 * propagate/incoming/extension/os-release staging directories and the verity settings for a root image.
 *
 * 'error_path' is passed through to setup_namespace() unchanged.
 *
 * Returns 0 on success, and also when setup_namespace() fails with -ENOANO but none of the configured
 * settings require namespacing (see insist_on_sandboxing()). Returns -EOPNOTSUPP when it fails with
 * -ENOANO and namespacing is required, and otherwise the negative error of whichever step failed. */
static int apply_mount_namespace(
                const Unit *u,
                ExecCommandFlags command_flags,
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime,
                const char *memory_pressure_path,
                char **error_path) {

        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
                        **read_write_paths_cleanup = NULL;
        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
                        *extension_dir = NULL, *host_os_release_stage = NULL;
        const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
        char **read_write_paths;
        NamespaceInfo ns_info;
        bool needs_sandboxing, setup_os_release_symlink;
        BindMount *bind_mounts = NULL;
        size_t n_bind_mounts = 0;
        int r;

        assert(context);

        CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);

        /* A root directory/image is only applied when chroot handling is requested; the ephemeral copy,
         * if the runtime has one, takes precedence over the configured path. */
        if (params->flags & EXEC_APPLY_CHROOT) {
                r = setup_ephemeral(context, runtime);
                if (r < 0)
                        return r;

                if (context->root_image)
                        root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
                else
                        root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
         * service will need to write to it in order to start the notifications. */
        if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                /* Work on a private copy so that the context's own list is left untouched. */
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
                if (!read_write_paths_cleanup)
                        return -ENOMEM;

                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
                if (r < 0)
                        return r;

                read_write_paths = read_write_paths_cleanup;
        } else
                read_write_paths = context->read_write_paths;

        /* Sandboxing is skipped for commands flagged as fully privileged. */
        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
        if (needs_sandboxing) {
                /* The runtime struct only contains the parent of the private /tmp,
                 * which is non-accessible to world users. Inside of it there's a /tmp
                 * that is sticky, and that's the one we want to use here.
                 * This does not apply when we are using /run/systemd/empty as fallback. */

                if (context->private_tmp && runtime && runtime->shared) {
                        if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
                                tmp_dir = runtime->shared->tmp_dir;
                        else if (runtime->shared->tmp_dir)
                                tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");

                        if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
                                var_tmp_dir = runtime->shared->var_tmp_dir;
                        else if (runtime->shared->var_tmp_dir)
                                var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
                }

                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = false,
                        .private_dev = context->private_devices,
                        .protect_control_groups = context->protect_control_groups,
                        .protect_kernel_tunables = context->protect_kernel_tunables,
                        .protect_kernel_modules = context->protect_kernel_modules,
                        .protect_kernel_logs = context->protect_kernel_logs,
                        .protect_hostname = context->protect_hostname,
                        .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
                        .protect_home = context->protect_home,
                        .protect_system = context->protect_system,
                        .protect_proc = context->protect_proc,
                        .proc_subset = context->proc_subset,
                        .private_network = exec_needs_network_namespace(context),
                        .private_ipc = exec_needs_ipc_namespace(context),
                        /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
                        .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
                };
        } else if (!context->dynamic_user && root_dir)
                /*
                 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
                 * sandbox info, otherwise enforce it, don't ignore protected paths and
                 * fail if we are unable to apply the sandbox inside the mount namespace.
                 */
                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = true,
                };
        else
                ns_info = (NamespaceInfo) {};

        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
        setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
        if (r < 0)
                return r;

        if (context->mount_propagation_flag == MS_SHARED)
                log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");

        /* The credentials path is only computed when the unit has credentials, a runtime directory
         * prefix is known and writing credentials was requested. */
        if (exec_context_has_credentials(context) &&
            params->prefix[EXEC_DIRECTORY_RUNTIME] &&
            FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
                creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
                if (!creds_path)
                        return -ENOMEM;
        }

        /* System scope uses fixed directories below /run/systemd; user scope uses per-UID directories
         * below /run/user and sets neither a propagate nor an incoming directory. */
        if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                propagate_dir = path_join("/run/systemd/propagate/", u->id);
                if (!propagate_dir)
                        return -ENOMEM;

                incoming_dir = strdup("/run/systemd/incoming");
                if (!incoming_dir)
                        return -ENOMEM;

                extension_dir = strdup("/run/systemd/unit-extensions");
                if (!extension_dir)
                        return -ENOMEM;

                /* If running under a different root filesystem, propagate the host's os-release. We make a
                 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
                if (setup_os_release_symlink) {
                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
                        if (!host_os_release_stage)
                                return -ENOMEM;
                }
        } else {
                assert(params->runtime_scope == RUNTIME_SCOPE_USER);

                if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
                        return -ENOMEM;

                if (setup_os_release_symlink) {
                        if (asprintf(&host_os_release_stage,
                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
                                     geteuid()) < 0)
                                return -ENOMEM;
                }
        }

        /* Verity settings are only prepared when a root image is used. */
        if (root_image) {
                r = verity_settings_prepare(
                        &verity,
                        root_image,
                        context->root_hash, context->root_hash_size, context->root_hash_path,
                        context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
                        context->root_verity);
                if (r < 0)
                        return r;
        }

        /* Note that the read-only/inaccessible/exec/no-exec path lists are only passed on when sandboxing
         * is applied, and the notify socket only when a root directory or image is in use. */
        r = setup_namespace(
                        root_dir,
                        root_image,
                        context->root_image_options,
                        context->root_image_policy ?: &image_policy_service,
                        &ns_info,
                        read_write_paths,
                        needs_sandboxing ? context->read_only_paths : NULL,
                        needs_sandboxing ? context->inaccessible_paths : NULL,
                        needs_sandboxing ? context->exec_paths : NULL,
                        needs_sandboxing ? context->no_exec_paths : NULL,
                        empty_directories,
                        symlinks,
                        bind_mounts,
                        n_bind_mounts,
                        context->temporary_filesystems,
                        context->n_temporary_filesystems,
                        context->mount_images,
                        context->n_mount_images,
                        context->mount_image_policy ?: &image_policy_service,
                        tmp_dir,
                        var_tmp_dir,
                        creds_path,
                        context->log_namespace,
                        context->mount_propagation_flag,
                        &verity,
                        context->extension_images,
                        context->n_extension_images,
                        context->extension_image_policy ?: &image_policy_sysext,
                        context->extension_directories,
                        propagate_dir,
                        incoming_dir,
                        extension_dir,
                        root_dir || root_image ? params->notify_socket : NULL,
                        host_os_release_stage,
                        error_path);

        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
         * completely different execution environment. */
        if (r == -ENOANO) {
                if (insist_on_sandboxing(
                                    context,
                                    root_dir, root_image,
                                    bind_mounts,
                                    n_bind_mounts))
                        return log_unit_debug_errno(u,
                                                    SYNTHETIC_ERRNO(EOPNOTSUPP),
                                                    "Failed to set up namespace, and refusing to continue since "
                                                    "the selected namespacing options alter mount environment non-trivially.\n"
                                                    "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
                                                    n_bind_mounts,
                                                    context->n_temporary_filesystems,
                                                    yes_no(root_dir),
                                                    yes_no(root_image),
                                                    yes_no(context->dynamic_user));

                log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
                return 0;
        }

        return r;
}
4220
4221 static int apply_working_directory(
4222                 const ExecContext *context,
4223                 const ExecParameters *params,
4224                 ExecRuntime *runtime,
4225                 const char *home,
4226                 int *exit_status) {
4227
4228         const char *d, *wd;
4229
4230         assert(context);
4231         assert(exit_status);
4232
4233         if (context->working_directory_home) {
4234
4235                 if (!home) {
4236                         *exit_status = EXIT_CHDIR;
4237                         return -ENXIO;
4238                 }
4239
4240                 wd = home;
4241
4242         } else
4243                 wd = empty_to_root(context->working_directory);
4244
4245         if (params->flags & EXEC_APPLY_CHROOT)
4246                 d = wd;
4247         else
4248                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4249
4250         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4251                 *exit_status = EXIT_CHDIR;
4252                 return -errno;
4253         }
4254
4255         return 0;
4256 }
4257
4258 static int apply_root_directory(
4259                 const ExecContext *context,
4260                 const ExecParameters *params,
4261                 ExecRuntime *runtime,
4262                 const bool needs_mount_ns,
4263                 int *exit_status) {
4264
4265         assert(context);
4266         assert(exit_status);
4267
4268         if (params->flags & EXEC_APPLY_CHROOT)
4269                 if (!needs_mount_ns && context->root_directory)
4270                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4271                                 *exit_status = EXIT_CHROOT;
4272                                 return -errno;
4273                         }
4274
4275         return 0;
4276 }
4277
4278 static int setup_keyring(
4279                 const Unit *u,
4280                 const ExecContext *context,
4281                 const ExecParameters *p,
4282                 uid_t uid, gid_t gid) {
4283
4284         key_serial_t keyring;
4285         int r = 0;
4286         uid_t saved_uid;
4287         gid_t saved_gid;
4288
4289         assert(u);
4290         assert(context);
4291         assert(p);
4292
4293         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4294          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4295          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4296          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4297          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4298          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4299
4300         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4301                 return 0;
4302
4303         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4304          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4305          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4306          * & group is just as nasty as acquiring a reference to the user keyring. */
4307
4308         saved_uid = getuid();
4309         saved_gid = getgid();
4310
4311         if (gid_is_valid(gid) && gid != saved_gid) {
4312                 if (setregid(gid, -1) < 0)
4313                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4314         }
4315
4316         if (uid_is_valid(uid) && uid != saved_uid) {
4317                 if (setreuid(uid, -1) < 0) {
4318                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4319                         goto out;
4320                 }
4321         }
4322
4323         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4324         if (keyring == -1) {
4325                 if (errno == ENOSYS)
4326                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4327                 else if (ERRNO_IS_PRIVILEGE(errno))
4328                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4329                 else if (errno == EDQUOT)
4330                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4331                 else
4332                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4333
4334                 goto out;
4335         }
4336
4337         /* When requested link the user keyring into the session keyring. */
4338         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4339
4340                 if (keyctl(KEYCTL_LINK,
4341                            KEY_SPEC_USER_KEYRING,
4342                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4343                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4344                         goto out;
4345                 }
4346         }
4347
4348         /* Restore uid/gid back */
4349         if (uid_is_valid(uid) && uid != saved_uid) {
4350                 if (setreuid(saved_uid, -1) < 0) {
4351                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4352                         goto out;
4353                 }
4354         }
4355
4356         if (gid_is_valid(gid) && gid != saved_gid) {
4357                 if (setregid(saved_gid, -1) < 0)
4358                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4359         }
4360
4361         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4362         if (!sd_id128_is_null(u->invocation_id)) {
4363                 key_serial_t key;
4364
4365                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4366                 if (key == -1)
4367                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4368                 else {
4369                         if (keyctl(KEYCTL_SETPERM, key,
4370                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4371                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4372                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4373                 }
4374         }
4375
4376 out:
4377         /* Revert back uid & gid for the last time, and exit */
4378         /* no extra logging, as only the first already reported error matters */
4379         if (getuid() != saved_uid)
4380                 (void) setreuid(saved_uid, -1);
4381
4382         if (getgid() != saved_gid)
4383                 (void) setregid(saved_gid, -1);
4384
4385         return r;
4386 }
4387
4388 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
4389         assert(array);
4390         assert(n);
4391         assert(pair);
4392
4393         if (pair[0] >= 0)
4394                 array[(*n)++] = pair[0];
4395         if (pair[1] >= 0)
4396                 array[(*n)++] = pair[1];
4397 }
4398
4399 static int close_remaining_fds(
4400                 const ExecParameters *params,
4401                 const ExecRuntime *runtime,
4402                 int user_lookup_fd,
4403                 int socket_fd,
4404                 const int *fds, size_t n_fds) {
4405
4406         size_t n_dont_close = 0;
4407         int dont_close[n_fds + 14];
4408
4409         assert(params);
4410
4411         if (params->stdin_fd >= 0)
4412                 dont_close[n_dont_close++] = params->stdin_fd;
4413         if (params->stdout_fd >= 0)
4414                 dont_close[n_dont_close++] = params->stdout_fd;
4415         if (params->stderr_fd >= 0)
4416                 dont_close[n_dont_close++] = params->stderr_fd;
4417
4418         if (socket_fd >= 0)
4419                 dont_close[n_dont_close++] = socket_fd;
4420         if (n_fds > 0) {
4421                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4422                 n_dont_close += n_fds;
4423         }
4424
4425         if (runtime)
4426                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4427
4428         if (runtime && runtime->shared) {
4429                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4430                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4431         }
4432
4433         if (runtime && runtime->dynamic_creds) {
4434                 if (runtime->dynamic_creds->user)
4435                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4436                 if (runtime->dynamic_creds->group)
4437                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4438         }
4439
4440         if (user_lookup_fd >= 0)
4441                 dont_close[n_dont_close++] = user_lookup_fd;
4442
4443         return close_all_fds(dont_close, n_dont_close);
4444 }
4445
4446 static int send_user_lookup(
4447                 Unit *unit,
4448                 int user_lookup_fd,
4449                 uid_t uid,
4450                 gid_t gid) {
4451
4452         assert(unit);
4453
4454         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4455          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4456          * specified. */
4457
4458         if (user_lookup_fd < 0)
4459                 return 0;
4460
4461         if (!uid_is_valid(uid) && !gid_is_valid(gid))
4462                 return 0;
4463
4464         if (writev(user_lookup_fd,
4465                (struct iovec[]) {
4466                            IOVEC_MAKE(&uid, sizeof(uid)),
4467                            IOVEC_MAKE(&gid, sizeof(gid)),
4468                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4469                 return -errno;
4470
4471         return 0;
4472 }
4473
4474 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4475         int r;
4476
4477         assert(c);
4478         assert(home);
4479         assert(buf);
4480
4481         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4482
4483         if (*home)
4484                 return 0;
4485
4486         if (!c->working_directory_home)
4487                 return 0;
4488
4489         r = get_home_dir(buf);
4490         if (r < 0)
4491                 return r;
4492
4493         *home = *buf;
4494         return 1;
4495 }
4496
4497 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4498         _cleanup_strv_free_ char ** list = NULL;
4499         int r;
4500
4501         assert(c);
4502         assert(p);
4503         assert(ret);
4504
4505         assert(c->dynamic_user);
4506
4507         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4508          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4509          * directories. */
4510
4511         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4512                 if (t == EXEC_DIRECTORY_CONFIGURATION)
4513                         continue;
4514
4515                 if (!p->prefix[t])
4516                         continue;
4517
4518                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4519                         char *e;
4520
4521                         if (exec_directory_is_private(c, t))
4522                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4523                         else
4524                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4525                         if (!e)
4526                                 return -ENOMEM;
4527
4528                         r = strv_consume(&list, e);
4529                         if (r < 0)
4530                                 return r;
4531                 }
4532         }
4533
4534         *ret = TAKE_PTR(list);
4535
4536         return 0;
4537 }
4538
4539 static int exec_parameters_get_cgroup_path(
4540                 const ExecParameters *params,
4541                 const CGroupContext *c,
4542                 char **ret) {
4543
4544         const char *subgroup = NULL;
4545         char *p;
4546
4547         assert(params);
4548         assert(ret);
4549
4550         if (!params->cgroup_path)
4551                 return -EINVAL;
4552
4553         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4554          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4555          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4556          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4557          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4558          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4559          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4560          * flag, which is only passed for the former statements, not for the latter. */
4561
4562         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4563                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4564                         subgroup = ".control";
4565                 else
4566                         subgroup = c->delegate_subgroup;
4567         }
4568
4569         if (subgroup)
4570                 p = path_join(params->cgroup_path, subgroup);
4571         else
4572                 p = strdup(params->cgroup_path);
4573         if (!p)
4574                 return -ENOMEM;
4575
4576         *ret = p;
4577         return !!subgroup;
4578 }
4579
4580 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4581         _cleanup_(cpu_set_reset) CPUSet s = {};
4582         int r;
4583
4584         assert(c);
4585         assert(ret);
4586
4587         if (!c->numa_policy.nodes.set) {
4588                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4589                 return 0;
4590         }
4591
4592         r = numa_to_cpu_set(&c->numa_policy, &s);
4593         if (r < 0)
4594                 return r;
4595
4596         cpu_set_reset(ret);
4597
4598         return cpu_set_add_all(ret, &s);
4599 }
4600
4601 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4602         assert(c);
4603
4604         return c->cpu_affinity_from_numa;
4605 }
4606
4607 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4608         int r;
4609
4610         assert(fds);
4611         assert(n_fds);
4612         assert(*n_fds < fds_size);
4613         assert(ret_fd);
4614
4615         if (fd < 0) {
4616                 *ret_fd = -EBADF;
4617                 return 0;
4618         }
4619
4620         if (fd < 3 + (int) *n_fds) {
4621                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4622                  * the fds we pass to the process (or which are closed only during execve). */
4623
4624                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4625                 if (r < 0)
4626                         return -errno;
4627
4628                 close_and_replace(fd, r);
4629         }
4630
4631         *ret_fd = fds[*n_fds] = fd;
4632         (*n_fds) ++;
4633         return 1;
4634 }
4635
4636 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4637         union sockaddr_union addr = {
4638                 .un.sun_family = AF_UNIX,
4639         };
4640         socklen_t sa_len;
4641         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4642         int r;
4643
4644         assert(u);
4645         assert(of);
4646         assert(ofd >= 0);
4647
4648         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4649         if (r < 0)
4650                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4651
4652         sa_len = r;
4653
4654         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4655                 _cleanup_close_ int fd = -EBADF;
4656
4657                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4658                 if (fd < 0)
4659                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4660
4661                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4662                 if (r == -EPROTOTYPE)
4663                         continue;
4664                 if (r < 0)
4665                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4666
4667                 return TAKE_FD(fd);
4668         }
4669
4670         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4671 }
4672
4673 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4674         struct stat st;
4675         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4676
4677         assert(u);
4678         assert(of);
4679
4680         ofd = open(of->path, O_PATH | O_CLOEXEC);
4681         if (ofd < 0)
4682                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4683
4684         if (fstat(ofd, &st) < 0)
4685                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4686
4687         if (S_ISSOCK(st.st_mode)) {
4688                 fd = connect_unix_harder(u, of, ofd);
4689                 if (fd < 0)
4690                         return fd;
4691
4692                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4693                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4694                                                     of->path);
4695
4696                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4697         } else {
4698                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4699                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4700                         flags |= O_APPEND;
4701                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4702                         flags |= O_TRUNC;
4703
4704                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4705                 if (fd < 0)
4706                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4707
4708                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4709         }
4710
4711         return TAKE_FD(fd);
4712 }
4713
4714 static int collect_open_file_fds(
4715                 Unit *u,
4716                 OpenFile* open_files,
4717                 int **fds,
4718                 char ***fdnames,
4719                 size_t *n_fds) {
4720         int r;
4721
4722         assert(u);
4723         assert(fds);
4724         assert(fdnames);
4725         assert(n_fds);
4726
4727         LIST_FOREACH(open_files, of, open_files) {
4728                 _cleanup_close_ int fd = -EBADF;
4729
4730                 fd = get_open_file_fd(u, of);
4731                 if (fd < 0) {
4732                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4733                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4734                                 continue;
4735                         }
4736
4737                         return fd;
4738                 }
4739
4740                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4741                         return -ENOMEM;
4742
4743                 r = strv_extend(fdnames, of->fdname);
4744                 if (r < 0)
4745                         return r;
4746
4747                 (*fds)[*n_fds] = TAKE_FD(fd);
4748
4749                 (*n_fds)++;
4750         }
4751
4752         return 0;
4753 }
4754
4755 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4756         assert(unit);
4757         assert(msg);
4758         assert(executable);
4759
4760         if (!DEBUG_LOGGING)
4761                 return;
4762
4763         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4764
4765         log_unit_struct(unit, LOG_DEBUG,
4766                         "EXECUTABLE=%s", executable,
4767                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4768                         LOG_UNIT_INVOCATION_ID(unit));
4769 }
4770
4771 static bool exec_context_need_unprivileged_private_users(
4772                 const ExecContext *context,
4773                 const ExecParameters *params) {
4774
4775         assert(context);
4776         assert(params);
4777
4778         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4779          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4780          * (system manager) then we have privileges and don't need this. */
4781         if (params->runtime_scope != RUNTIME_SCOPE_USER)
4782                 return false;
4783
4784         return context->private_users ||
4785                context->private_tmp ||
4786                context->private_devices ||
4787                context->private_network ||
4788                context->network_namespace_path ||
4789                context->private_ipc ||
4790                context->ipc_namespace_path ||
4791                context->private_mounts > 0 ||
4792                context->mount_apivfs ||
4793                context->n_bind_mounts > 0 ||
4794                context->n_temporary_filesystems > 0 ||
4795                context->root_directory ||
4796                !strv_isempty(context->extension_directories) ||
4797                context->protect_system != PROTECT_SYSTEM_NO ||
4798                context->protect_home != PROTECT_HOME_NO ||
4799                context->protect_kernel_tunables ||
4800                context->protect_kernel_modules ||
4801                context->protect_kernel_logs ||
4802                context->protect_control_groups ||
4803                context->protect_clock ||
4804                context->protect_hostname ||
4805                !strv_isempty(context->read_write_paths) ||
4806                !strv_isempty(context->read_only_paths) ||
4807                !strv_isempty(context->inaccessible_paths) ||
4808                !strv_isempty(context->exec_paths) ||
4809                !strv_isempty(context->no_exec_paths);
4810 }
4811
4812 static int exec_child(
4813                 Unit *unit,
4814                 const ExecCommand *command,
4815                 const ExecContext *context,
4816                 const ExecParameters *params,
4817                 ExecRuntime *runtime,
4818                 const CGroupContext *cgroup_context,
4819                 int socket_fd,
4820                 const int named_iofds[static 3],
4821                 int *params_fds,
4822                 size_t n_socket_fds,
4823                 size_t n_storage_fds,
4824                 char **files_env,
4825                 int user_lookup_fd,
4826                 int *exit_status) {
4827
4828         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4829         int r, ngids = 0, exec_fd;
4830         _cleanup_free_ gid_t *supplementary_gids = NULL;
4831         const char *username = NULL, *groupname = NULL;
4832         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4833         const char *home = NULL, *shell = NULL;
4834         char **final_argv = NULL;
4835         dev_t journal_stream_dev = 0;
4836         ino_t journal_stream_ino = 0;
4837         bool userns_set_up = false;
4838         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4839                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4840                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4841                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4842 #if HAVE_SELINUX
4843         _cleanup_free_ char *mac_selinux_context_net = NULL;
4844         bool use_selinux = false;
4845 #endif
4846 #if ENABLE_SMACK
4847         bool use_smack = false;
4848 #endif
4849 #if HAVE_APPARMOR
4850         bool use_apparmor = false;
4851 #endif
4852         uid_t saved_uid = getuid();
4853         gid_t saved_gid = getgid();
4854         uid_t uid = UID_INVALID;
4855         gid_t gid = GID_INVALID;
4856         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4857                n_keep_fds; /* total number of fds not to close */
4858         int secure_bits;
4859         _cleanup_free_ gid_t *gids_after_pam = NULL;
4860         int ngids_after_pam = 0;
4861         _cleanup_free_ int *fds = NULL;
4862         _cleanup_strv_free_ char **fdnames = NULL;
4863
4864         assert(unit);
4865         assert(command);
4866         assert(context);
4867         assert(params);
4868         assert(exit_status);
4869
4870         /* Explicitly test for CVE-2021-4034 inspired invocations */
4871         assert(command->path);
4872         assert(!strv_isempty(command->argv));
4873
4874         rename_process_from_path(command->path);
4875
4876         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4877          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4878          * both of which will be demoted to SIG_DFL. */
4879         (void) default_signals(SIGNALS_CRASH_HANDLER,
4880                                SIGNALS_IGNORE);
4881
4882         if (context->ignore_sigpipe)
4883                 (void) ignore_signals(SIGPIPE);
4884
4885         r = reset_signal_mask();
4886         if (r < 0) {
4887                 *exit_status = EXIT_SIGNAL_MASK;
4888                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4889         }
4890
4891         if (params->idle_pipe)
4892                 do_idle_pipe_dance(params->idle_pipe);
4893
4894         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4895          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4896          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4897          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4898
4899         log_forget_fds();
4900         log_set_open_when_needed(true);
4901         log_settle_target();
4902
4903         /* In case anything used libc syslog(), close this here, too */
4904         closelog();
4905
4906         fds = newdup(int, params_fds, n_fds);
4907         if (!fds) {
4908                 *exit_status = EXIT_MEMORY;
4909                 return log_oom();
4910         }
4911
4912         fdnames = strv_copy((char**) params->fd_names);
4913         if (!fdnames) {
4914                 *exit_status = EXIT_MEMORY;
4915                 return log_oom();
4916         }
4917
4918         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4919         if (r < 0) {
4920                 *exit_status = EXIT_FDS;
4921                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4922         }
4923
4924         int keep_fds[n_fds + 3];
4925         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4926         n_keep_fds = n_fds;
4927
4928         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4929         if (r < 0) {
4930                 *exit_status = EXIT_FDS;
4931                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4932         }
4933
4934 #if HAVE_LIBBPF
4935         if (unit->manager->restrict_fs) {
4936                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4937                 if (bpf_map_fd < 0) {
4938                         *exit_status = EXIT_FDS;
4939                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4940                 }
4941
4942                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4943                 if (r < 0) {
4944                         *exit_status = EXIT_FDS;
4945                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4946                 }
4947         }
4948 #endif
4949
4950         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4951         if (r < 0) {
4952                 *exit_status = EXIT_FDS;
4953                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4954         }
4955
4956         if (!context->same_pgrp &&
4957             setsid() < 0) {
4958                 *exit_status = EXIT_SETSID;
4959                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4960         }
4961
4962         exec_context_tty_reset(context, params);
4963
4964         if (unit_shall_confirm_spawn(unit)) {
4965                 _cleanup_free_ char *cmdline = NULL;
4966
4967                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4968                 if (!cmdline) {
4969                         *exit_status = EXIT_MEMORY;
4970                         return log_oom();
4971                 }
4972
4973                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4974                 if (r != CONFIRM_EXECUTE) {
4975                         if (r == CONFIRM_PRETEND_SUCCESS) {
4976                                 *exit_status = EXIT_SUCCESS;
4977                                 return 0;
4978                         }
4979
4980                         *exit_status = EXIT_CONFIRM;
4981                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4982                                                     "Execution cancelled by the user");
4983                 }
4984         }
4985
4986         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4987          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4988          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4989          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4990          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4991         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4992             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4993                 *exit_status = EXIT_MEMORY;
4994                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4995         }
4996
4997         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4998                 _cleanup_strv_free_ char **suggested_paths = NULL;
4999
5000                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
5001                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
5002                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
5003                         *exit_status = EXIT_USER;
5004                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
5005                 }
5006
5007                 r = compile_suggested_paths(context, params, &suggested_paths);
5008                 if (r < 0) {
5009                         *exit_status = EXIT_MEMORY;
5010                         return log_oom();
5011                 }
5012
5013                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
5014                 if (r < 0) {
5015                         *exit_status = EXIT_USER;
5016                         if (r == -EILSEQ)
5017                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5018                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
5019                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
5020                 }
5021
5022                 if (!uid_is_valid(uid)) {
5023                         *exit_status = EXIT_USER;
5024                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
5025                 }
5026
5027                 if (!gid_is_valid(gid)) {
5028                         *exit_status = EXIT_USER;
5029                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
5030                 }
5031
5032                 if (runtime->dynamic_creds->user)
5033                         username = runtime->dynamic_creds->user->name;
5034
5035         } else {
5036                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5037                 if (r < 0) {
5038                         *exit_status = EXIT_USER;
5039                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5040                 }
5041
5042                 r = get_fixed_group(context, &groupname, &gid);
5043                 if (r < 0) {
5044                         *exit_status = EXIT_GROUP;
5045                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5046                 }
5047         }
5048
5049         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5050         r = get_supplementary_groups(context, username, groupname, gid,
5051                                      &supplementary_gids, &ngids);
5052         if (r < 0) {
5053                 *exit_status = EXIT_GROUP;
5054                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5055         }
5056
5057         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5058         if (r < 0) {
5059                 *exit_status = EXIT_USER;
5060                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5061         }
5062
5063         user_lookup_fd = safe_close(user_lookup_fd);
5064
5065         r = acquire_home(context, uid, &home, &home_buffer);
5066         if (r < 0) {
5067                 *exit_status = EXIT_CHDIR;
5068                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5069         }
5070
5071         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5072         if (socket_fd >= 0)
5073                 (void) fd_nonblock(socket_fd, false);
5074
5075         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5076          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5077         if (params->cgroup_path) {
5078                 _cleanup_free_ char *p = NULL;
5079
5080                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5081                 if (r < 0) {
5082                         *exit_status = EXIT_CGROUP;
5083                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5084                 }
5085
5086                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5087                 if (r == -EUCLEAN) {
5088                         *exit_status = EXIT_CGROUP;
5089                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5090                                                     "because the cgroup or one of its parents or "
5091                                                     "siblings is in the threaded mode: %m", p);
5092                 }
5093                 if (r < 0) {
5094                         *exit_status = EXIT_CGROUP;
5095                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5096                 }
5097         }
5098
5099         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5100                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5101                 if (r < 0) {
5102                         *exit_status = EXIT_NETWORK;
5103                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5104                 }
5105         }
5106
5107         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5108                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5109                 if (r < 0) {
5110                         *exit_status = EXIT_NAMESPACE;
5111                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5112                 }
5113         }
5114
5115         r = setup_input(context, params, socket_fd, named_iofds);
5116         if (r < 0) {
5117                 *exit_status = EXIT_STDIN;
5118                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5119         }
5120
5121         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5122         if (r < 0) {
5123                 *exit_status = EXIT_STDOUT;
5124                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5125         }
5126
5127         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5128         if (r < 0) {
5129                 *exit_status = EXIT_STDERR;
5130                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5131         }
5132
5133         if (context->oom_score_adjust_set) {
5134                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
5135                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5136                 r = set_oom_score_adjust(context->oom_score_adjust);
5137                 if (ERRNO_IS_NEG_PRIVILEGE(r))
5138                         log_unit_debug_errno(unit, r,
5139                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5140                 else if (r < 0) {
5141                         *exit_status = EXIT_OOM_ADJUST;
5142                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5143                 }
5144         }
5145
5146         if (context->coredump_filter_set) {
5147                 r = set_coredump_filter(context->coredump_filter);
5148                 if (ERRNO_IS_NEG_PRIVILEGE(r))
5149                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5150                 else if (r < 0) {
5151                         *exit_status = EXIT_LIMITS;
5152                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5153                 }
5154         }
5155
5156         if (context->nice_set) {
5157                 r = setpriority_closest(context->nice);
5158                 if (r < 0) {
5159                         *exit_status = EXIT_NICE;
5160                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5161                 }
5162         }
5163
5164         if (context->cpu_sched_set) {
5165                 struct sched_param param = {
5166                         .sched_priority = context->cpu_sched_priority,
5167                 };
5168
5169                 r = sched_setscheduler(0,
5170                                        context->cpu_sched_policy |
5171                                        (context->cpu_sched_reset_on_fork ?
5172                                         SCHED_RESET_ON_FORK : 0),
5173                                        &param);
5174                 if (r < 0) {
5175                         *exit_status = EXIT_SETSCHEDULER;
5176                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5177                 }
5178         }
5179
5180         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5181                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5182                 const CPUSet *cpu_set;
5183
5184                 if (context->cpu_affinity_from_numa) {
5185                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5186                         if (r < 0) {
5187                                 *exit_status = EXIT_CPUAFFINITY;
5188                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5189                         }
5190
5191                         cpu_set = &converted_cpu_set;
5192                 } else
5193                         cpu_set = &context->cpu_set;
5194
5195                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5196                         *exit_status = EXIT_CPUAFFINITY;
5197                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5198                 }
5199         }
5200
5201         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5202                 r = apply_numa_policy(&context->numa_policy);
5203                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
5204                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5205                 else if (r < 0) {
5206                         *exit_status = EXIT_NUMA_POLICY;
5207                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5208                 }
5209         }
5210
5211         if (context->ioprio_set)
5212                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5213                         *exit_status = EXIT_IOPRIO;
5214                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5215                 }
5216
5217         if (context->timer_slack_nsec != NSEC_INFINITY)
5218                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5219                         *exit_status = EXIT_TIMERSLACK;
5220                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5221                 }
5222
5223         if (context->personality != PERSONALITY_INVALID) {
5224                 r = safe_personality(context->personality);
5225                 if (r < 0) {
5226                         *exit_status = EXIT_PERSONALITY;
5227                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5228                 }
5229         }
5230
5231         if (context->utmp_id) {
5232                 const char *line = context->tty_path ?
5233                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5234                         NULL;
5235                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5236                                       line,
5237                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
5238                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5239                                       USER_PROCESS,
5240                                       username);
5241         }
5242
5243         if (uid_is_valid(uid)) {
5244                 r = chown_terminal(STDIN_FILENO, uid);
5245                 if (r < 0) {
5246                         *exit_status = EXIT_STDIN;
5247                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5248                 }
5249         }
5250
5251         if (params->cgroup_path) {
5252                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5253                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5254                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5255                  * touch a single hierarchy too. */
5256
5257                 if (params->flags & EXEC_CGROUP_DELEGATE) {
5258                         _cleanup_free_ char *p = NULL;
5259
5260                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5261                         if (r < 0) {
5262                                 *exit_status = EXIT_CGROUP;
5263                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5264                         }
5265
5266                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5267                         if (r < 0) {
5268                                 *exit_status = EXIT_CGROUP;
5269                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5270                         }
5271                         if (r > 0) {
5272                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5273                                 if (r < 0) {
5274                                         *exit_status = EXIT_CGROUP;
5275                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5276                                 }
5277                         }
5278                 }
5279
5280                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5281                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
5282                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5283                                 if (r < 0) {
5284                                         *exit_status = EXIT_MEMORY;
5285                                         return log_oom();
5286                                 }
5287
5288                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5289                                 if (r < 0) {
5290                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5291                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5292                                         memory_pressure_path = mfree(memory_pressure_path);
5293                                 }
5294                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5295                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5296                                 if (!memory_pressure_path) {
5297                                         *exit_status = EXIT_MEMORY;
5298                                         return log_oom();
5299                                 }
5300                         }
5301                 }
5302         }
5303
5304         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5305
5306         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5307                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5308                 if (r < 0)
5309                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5310         }
5311
5312         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5313                 r = setup_credentials(context, params, unit->id, uid, gid);
5314                 if (r < 0) {
5315                         *exit_status = EXIT_CREDENTIALS;
5316                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5317                 }
5318         }
5319
5320         r = build_environment(
5321                         unit,
5322                         context,
5323                         params,
5324                         cgroup_context,
5325                         n_fds,
5326                         fdnames,
5327                         home,
5328                         username,
5329                         shell,
5330                         journal_stream_dev,
5331                         journal_stream_ino,
5332                         memory_pressure_path,
5333                         &our_env);
5334         if (r < 0) {
5335                 *exit_status = EXIT_MEMORY;
5336                 return log_oom();
5337         }
5338
5339         r = build_pass_environment(context, &pass_env);
5340         if (r < 0) {
5341                 *exit_status = EXIT_MEMORY;
5342                 return log_oom();
5343         }
5344
5345         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5346          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5347          * not specify PATH but the unit has ExecSearchPath. */
5348         if (!strv_isempty(context->exec_search_path)) {
5349                 _cleanup_free_ char *joined = NULL;
5350
5351                 joined = strv_join(context->exec_search_path, ":");
5352                 if (!joined) {
5353                         *exit_status = EXIT_MEMORY;
5354                         return log_oom();
5355                 }
5356
5357                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5358                 if (r < 0) {
5359                         *exit_status = EXIT_MEMORY;
5360                         return log_oom();
5361                 }
5362         }
5363
5364         accum_env = strv_env_merge(params->environment,
5365                                    our_env,
5366                                    joined_exec_search_path,
5367                                    pass_env,
5368                                    context->environment,
5369                                    files_env);
5370         if (!accum_env) {
5371                 *exit_status = EXIT_MEMORY;
5372                 return log_oom();
5373         }
5374         accum_env = strv_env_clean(accum_env);
5375
5376         (void) umask(context->umask);
5377
5378         r = setup_keyring(unit, context, params, uid, gid);
5379         if (r < 0) {
5380                 *exit_status = EXIT_KEYRING;
5381                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5382         }
5383
5384         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5385          * from it. */
5386         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5387
5388         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5389          * for it, and the kernel doesn't actually support ambient caps. */
5390         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5391
5392         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5393          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5394          * desired. */
5395         if (needs_ambient_hack)
5396                 needs_setuid = false;
5397         else
5398                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5399
5400         uint64_t capability_ambient_set = context->capability_ambient_set;
5401
5402         if (needs_sandboxing) {
5403                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5404                  * /sys being present. The actual MAC context application will happen later, as late as
5405                  * possible, to avoid impacting our own code paths. */
5406
5407 #if HAVE_SELINUX
5408                 use_selinux = mac_selinux_use();
5409 #endif
5410 #if ENABLE_SMACK
5411                 use_smack = mac_smack_use();
5412 #endif
5413 #if HAVE_APPARMOR
5414                 use_apparmor = mac_apparmor_use();
5415 #endif
5416         }
5417
5418         if (needs_sandboxing) {
5419                 int which_failed;
5420
5421                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5422                  * is set here. (See below.) */
5423
5424                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5425                 if (r < 0) {
5426                         *exit_status = EXIT_LIMITS;
5427                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5428                 }
5429         }
5430
5431         if (needs_setuid && context->pam_name && username) {
5432                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5433                  * wins here. (See above.) */
5434
5435                 /* All fds passed in the fds array will be closed in the pam child process. */
5436                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5437                 if (r < 0) {
5438                         *exit_status = EXIT_PAM;
5439                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5440                 }
5441
5442                 if (ambient_capabilities_supported()) {
5443                         uint64_t ambient_after_pam;
5444
5445                         /* PAM modules might have set some ambient caps. Query them here and merge them into
5446                          * the caps we want to set in the end, so that we don't end up unsetting them. */
5447                         r = capability_get_ambient(&ambient_after_pam);
5448                         if (r < 0) {
5449                                 *exit_status = EXIT_CAPABILITIES;
5450                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5451                         }
5452
5453                         capability_ambient_set |= ambient_after_pam;
5454                 }
5455
5456                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5457                 if (ngids_after_pam < 0) {
5458                         *exit_status = EXIT_MEMORY;
5459                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5460                 }
5461         }
5462
5463         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5464                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5465                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5466                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5467
5468                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5469                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5470                  * the actual requested operations fail (or silently continue). */
5471                 if (r < 0 && context->private_users) {
5472                         *exit_status = EXIT_USER;
5473                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5474                 }
5475                 if (r < 0)
5476                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5477                 else
5478                         userns_set_up = true;
5479         }
5480
5481         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5482
5483                 /* Try to enable network namespacing if network namespacing is available and we have
5484                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5485                  * new network namespace. And if we don't have that, then we could only create a network
5486                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5487                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5488                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5489                         if (ERRNO_IS_NEG_PRIVILEGE(r))
5490                                 log_unit_notice_errno(unit, r,
5491                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5492                         else if (r < 0) {
5493                                 *exit_status = EXIT_NETWORK;
5494                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5495                         }
5496                 } else if (context->network_namespace_path) {
5497                         *exit_status = EXIT_NETWORK;
5498                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5499                                                     "NetworkNamespacePath= is not supported, refusing.");
5500                 } else
5501                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5502         }
5503
5504         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5505
5506                 if (ns_type_supported(NAMESPACE_IPC)) {
5507                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5508                         if (r == -EPERM)
5509                                 log_unit_warning_errno(unit, r,
5510                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5511                         else if (r < 0) {
5512                                 *exit_status = EXIT_NAMESPACE;
5513                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5514                         }
5515                 } else if (context->ipc_namespace_path) {
5516                         *exit_status = EXIT_NAMESPACE;
5517                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5518                                                     "IPCNamespacePath= is not supported, refusing.");
5519                 } else
5520                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5521         }
5522
5523         if (needs_mount_namespace) {
5524                 _cleanup_free_ char *error_path = NULL;
5525
5526                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5527                 if (r < 0) {
5528                         *exit_status = EXIT_NAMESPACE;
5529                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5530                                                     error_path ? ": " : "", strempty(error_path));
5531                 }
5532         }
5533
5534         if (needs_sandboxing) {
5535                 r = apply_protect_hostname(unit, context, exit_status);
5536                 if (r < 0)
5537                         return r;
5538         }
5539
5540         if (context->memory_ksm >= 0)
5541                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5542                         if (ERRNO_IS_NOT_SUPPORTED(errno))
5543                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5544                         else {
5545                                 *exit_status = EXIT_KSM;
5546                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5547                         }
5548                 }
5549
5550         /* Drop groups as early as possible.
5551          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5552          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5553         if (needs_setuid) {
5554                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5555                 int ngids_to_enforce = 0;
5556
5557                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5558                                                    ngids,
5559                                                    gids_after_pam,
5560                                                    ngids_after_pam,
5561                                                    &gids_to_enforce);
5562                 if (ngids_to_enforce < 0) {
5563                         *exit_status = EXIT_MEMORY;
5564                         return log_unit_error_errno(unit,
5565                                                     ngids_to_enforce,
5566                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
5567                 }
5568
5569                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5570                 if (r < 0) {
5571                         *exit_status = EXIT_GROUP;
5572                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5573                 }
5574         }
5575
5576         /* If the user namespace was not set up above, try to do it now.
5577          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5578          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5579          * case of mount namespaces being less privileged when the mount point list is copied from a
5580          * different user namespace). */
5581
5582         if (needs_sandboxing && context->private_users && !userns_set_up) {
5583                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5584                 if (r < 0) {
5585                         *exit_status = EXIT_USER;
5586                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5587                 }
5588         }
5589
5590         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5591          * shall execute. */
5592
5593         _cleanup_free_ char *executable = NULL;
5594         _cleanup_close_ int executable_fd = -EBADF;
5595         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5596         if (r < 0) {
5597                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5598                         log_unit_struct_errno(unit, LOG_INFO, r,
5599                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5600                                               LOG_UNIT_INVOCATION_ID(unit),
5601                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5602                                                                command->path),
5603                                               "EXECUTABLE=%s", command->path);
5604                         *exit_status = EXIT_SUCCESS;
5605                         return 0;
5606                 }
5607
5608                 *exit_status = EXIT_EXEC;
5609                 return log_unit_struct_errno(unit, LOG_INFO, r,
5610                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5611                                              LOG_UNIT_INVOCATION_ID(unit),
5612                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5613                                                               command->path),
5614                                              "EXECUTABLE=%s", command->path);
5615         }
5616
5617         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5618         if (r < 0) {
5619                 *exit_status = EXIT_FDS;
5620                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5621         }
5622
5623 #if HAVE_SELINUX
5624         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5625                 int fd = -EBADF;
5626
5627                 if (socket_fd >= 0)
5628                         fd = socket_fd;
5629                 else if (params->n_socket_fds == 1)
5630                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5631                          * use context from that fd to compute the label. */
5632                         fd = params->fds[0];
5633
5634                 if (fd >= 0) {
5635                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5636                         if (r < 0) {
5637                                 if (!context->selinux_context_ignore) {
5638                                         *exit_status = EXIT_SELINUX_CONTEXT;
5639                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5640                                 }
5641                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5642                         }
5643                 }
5644         }
5645 #endif
5646
5647         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5648          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5649          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5650          * execve(). */
5651
5652         r = close_all_fds(keep_fds, n_keep_fds);
5653         if (r >= 0)
5654                 r = shift_fds(fds, n_fds);
5655         if (r >= 0)
5656                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5657         if (r < 0) {
5658                 *exit_status = EXIT_FDS;
5659                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5660         }
5661
5662         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5663          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5664          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5665          * came this far. */
5666
5667         secure_bits = context->secure_bits;
5668
5669         if (needs_sandboxing) {
5670                 uint64_t bset;
5671
5672                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5673                  * (Note this is placed after the general resource limit initialization, see above, in order
5674                  * to take precedence.) */
5675                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5676                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5677                                 *exit_status = EXIT_LIMITS;
5678                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5679                         }
5680                 }
5681
5682 #if ENABLE_SMACK
5683                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5684                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5685                 if (use_smack) {
5686                         r = setup_smack(unit->manager, context, executable_fd);
5687                         if (r < 0 && !context->smack_process_label_ignore) {
5688                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5689                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5690                         }
5691                 }
5692 #endif
5693
5694                 bset = context->capability_bounding_set;
5695                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5696                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5697                  * instead of us doing that */
5698                 if (needs_ambient_hack)
5699                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5700                                 (UINT64_C(1) << CAP_SETUID) |
5701                                 (UINT64_C(1) << CAP_SETGID);
5702
5703                 if (!cap_test_all(bset)) {
5704                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5705                         if (r < 0) {
5706                                 *exit_status = EXIT_CAPABILITIES;
5707                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5708                         }
5709                 }
5710
5711                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5712                  * keep-caps set.
5713                  *
5714                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5715                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5716                  * the ambient capabilities can be raised as they are present in the permitted and
5717                  * inheritable set. However it is possible that someone wants to set ambient capabilities
5718                  * without changing the user, so we also set the ambient capabilities here.
5719                  *
5720                  * The requested ambient capabilities are raised in the inheritable set if the second
5721                  * argument is true. */
5722                 if (!needs_ambient_hack) {
5723                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5724                         if (r < 0) {
5725                                 *exit_status = EXIT_CAPABILITIES;
5726                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5727                         }
5728                 }
5729         }
5730
5731         /* chroot to root directory first, before we lose the ability to chroot */
5732         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5733         if (r < 0)
5734                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5735
5736         if (needs_setuid) {
5737                 if (uid_is_valid(uid)) {
5738                         r = enforce_user(context, uid, capability_ambient_set);
5739                         if (r < 0) {
5740                                 *exit_status = EXIT_USER;
5741                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5742                         }
5743
5744                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5745
5746                                 /* Raise the ambient capabilities after user change. */
5747                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5748                                 if (r < 0) {
5749                                         *exit_status = EXIT_CAPABILITIES;
5750                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5751                                 }
5752                         }
5753                 }
5754         }
5755
5756         /* Apply working directory here, because the working directory might be on NFS and only the user running
5757          * this service might have the correct privilege to change to the working directory */
5758         r = apply_working_directory(context, params, runtime, home, exit_status);
5759         if (r < 0)
5760                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5761
5762         if (needs_sandboxing) {
5763                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5764                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5765                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5766                  * are restricted. */
5767
5768 #if HAVE_SELINUX
5769                 if (use_selinux) {
5770                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5771
5772                         if (exec_context) {
5773                                 r = setexeccon(exec_context);
5774                                 if (r < 0) {
5775                                         if (!context->selinux_context_ignore) {
5776                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5777                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5778                                         }
5779                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5780                                 }
5781                         }
5782                 }
5783 #endif
5784
5785 #if HAVE_APPARMOR
5786                 if (use_apparmor && context->apparmor_profile) {
5787                         r = aa_change_onexec(context->apparmor_profile);
5788                         if (r < 0 && !context->apparmor_profile_ignore) {
5789                                 *exit_status = EXIT_APPARMOR_PROFILE;
5790                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5791                         }
5792                 }
5793 #endif
5794
5795                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5796                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5797                  * requires CAP_SETPCAP. */
5798                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5799                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5800                          * effective set here.
5801                          *
5802                          * The effective set is overwritten during execve() with the following values:
5803                          *
5804                          * - ambient set (for non-root processes)
5805                          *
5806                          * - (inheritable | bounding) set for root processes)
5807                          *
5808                          * Hence there is no security impact to raise it in the effective set before execve
5809                          */
5810                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5811                         if (r < 0) {
5812                                 *exit_status = EXIT_CAPABILITIES;
5813                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5814                         }
5815                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5816                                 *exit_status = EXIT_SECUREBITS;
5817                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5818                         }
5819                 }
5820
5821                 if (context_has_no_new_privileges(context))
5822                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5823                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5824                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5825                         }
5826
5827 #if HAVE_SECCOMP
5828                 r = apply_address_families(unit, context);
5829                 if (r < 0) {
5830                         *exit_status = EXIT_ADDRESS_FAMILIES;
5831                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5832                 }
5833
5834                 r = apply_memory_deny_write_execute(unit, context);
5835                 if (r < 0) {
5836                         *exit_status = EXIT_SECCOMP;
5837                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5838                 }
5839
5840                 r = apply_restrict_realtime(unit, context);
5841                 if (r < 0) {
5842                         *exit_status = EXIT_SECCOMP;
5843                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5844                 }
5845
5846                 r = apply_restrict_suid_sgid(unit, context);
5847                 if (r < 0) {
5848                         *exit_status = EXIT_SECCOMP;
5849                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5850                 }
5851
5852                 r = apply_restrict_namespaces(unit, context);
5853                 if (r < 0) {
5854                         *exit_status = EXIT_SECCOMP;
5855                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5856                 }
5857
5858                 r = apply_protect_sysctl(unit, context);
5859                 if (r < 0) {
5860                         *exit_status = EXIT_SECCOMP;
5861                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5862                 }
5863
5864                 r = apply_protect_kernel_modules(unit, context);
5865                 if (r < 0) {
5866                         *exit_status = EXIT_SECCOMP;
5867                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5868                 }
5869
5870                 r = apply_protect_kernel_logs(unit, context);
5871                 if (r < 0) {
5872                         *exit_status = EXIT_SECCOMP;
5873                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5874                 }
5875
5876                 r = apply_protect_clock(unit, context);
5877                 if (r < 0) {
5878                         *exit_status = EXIT_SECCOMP;
5879                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5880                 }
5881
5882                 r = apply_private_devices(unit, context);
5883                 if (r < 0) {
5884                         *exit_status = EXIT_SECCOMP;
5885                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5886                 }
5887
5888                 r = apply_syscall_archs(unit, context);
5889                 if (r < 0) {
5890                         *exit_status = EXIT_SECCOMP;
5891                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5892                 }
5893
5894                 r = apply_lock_personality(unit, context);
5895                 if (r < 0) {
5896                         *exit_status = EXIT_SECCOMP;
5897                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5898                 }
5899
5900                 r = apply_syscall_log(unit, context);
5901                 if (r < 0) {
5902                         *exit_status = EXIT_SECCOMP;
5903                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5904                 }
5905
5906                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5907                  * by the filter as little as possible. */
5908                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5909                 if (r < 0) {
5910                         *exit_status = EXIT_SECCOMP;
5911                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5912                 }
5913 #endif
5914
5915 #if HAVE_LIBBPF
5916                 r = apply_restrict_filesystems(unit, context);
5917                 if (r < 0) {
5918                         *exit_status = EXIT_BPF;
5919                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5920                 }
5921 #endif
5922
5923         }
5924
5925         if (!strv_isempty(context->unset_environment)) {
5926                 char **ee = NULL;
5927
5928                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5929                 if (!ee) {
5930                         *exit_status = EXIT_MEMORY;
5931                         return log_oom();
5932                 }
5933
5934                 strv_free_and_replace(accum_env, ee);
5935         }
5936
5937         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5938                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5939
5940                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5941                 if (r < 0) {
5942                         *exit_status = EXIT_MEMORY;
5943                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5944                 }
5945                 final_argv = replaced_argv;
5946
5947                 if (!strv_isempty(unset_variables)) {
5948                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5949                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5950                 }
5951
5952                 if (!strv_isempty(bad_variables)) {
5953                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5954                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5955                 }
5956         } else
5957                 final_argv = command->argv;
5958
5959         log_command_line(unit, "Executing", executable, final_argv);
5960
5961         if (exec_fd >= 0) {
5962                 uint8_t hot = 1;
5963
5964                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5965                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5966
5967                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5968                         *exit_status = EXIT_EXEC;
5969                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5970                 }
5971         }
5972
5973         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5974
5975         if (exec_fd >= 0) {
5976                 uint8_t hot = 0;
5977
5978                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5979                  * that POLLHUP on it no longer means execve() succeeded. */
5980
5981                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5982                         *exit_status = EXIT_EXEC;
5983                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5984                 }
5985         }
5986
5987         *exit_status = EXIT_EXEC;
5988         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5989 }
5990
5991 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5992 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5993
5994 int exec_spawn(Unit *unit,
5995                ExecCommand *command,
5996                const ExecContext *context,
5997                const ExecParameters *params,
5998                ExecRuntime *runtime,
5999                const CGroupContext *cgroup_context,
6000                pid_t *ret) {
6001
6002         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
6003         _cleanup_free_ char *subcgroup_path = NULL;
6004         _cleanup_strv_free_ char **files_env = NULL;
6005         size_t n_storage_fds = 0, n_socket_fds = 0;
6006         pid_t pid;
6007
6008         assert(unit);
6009         assert(command);
6010         assert(context);
6011         assert(ret);
6012         assert(params);
6013         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
6014
6015         LOG_CONTEXT_PUSH_UNIT(unit);
6016
6017         if (context->std_input == EXEC_INPUT_SOCKET ||
6018             context->std_output == EXEC_OUTPUT_SOCKET ||
6019             context->std_error == EXEC_OUTPUT_SOCKET) {
6020
6021                 if (params->n_socket_fds > 1)
6022                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
6023
6024                 if (params->n_socket_fds == 0)
6025                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
6026
6027                 socket_fd = params->fds[0];
6028         } else {
6029                 socket_fd = -EBADF;
6030                 fds = params->fds;
6031                 n_socket_fds = params->n_socket_fds;
6032                 n_storage_fds = params->n_storage_fds;
6033         }
6034
6035         r = exec_context_named_iofds(context, params, named_iofds);
6036         if (r < 0)
6037                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6038
6039         r = exec_context_load_environment(unit, context, &files_env);
6040         if (r < 0)
6041                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6042
6043         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6044            and, until the next SELinux policy changes, we save further reloads in future children. */
6045         mac_selinux_maybe_reload();
6046
6047         /* We won't know the real executable path until we create the mount namespace in the child, but we
6048            want to log from the parent, so we use the possibly inaccurate path here. */
6049         log_command_line(unit, "About to execute", command->path, command->argv);
6050
6051         if (params->cgroup_path) {
6052                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6053                 if (r < 0)
6054                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6055                 if (r > 0) {
6056                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6057                          * realized by the unit logic) */
6058
6059                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6060                         if (r < 0)
6061                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6062                 }
6063         }
6064
6065         pid = fork();
6066         if (pid < 0)
6067                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6068
6069         if (pid == 0) {
6070                 int exit_status;
6071
6072                 r = exec_child(unit,
6073                                command,
6074                                context,
6075                                params,
6076                                runtime,
6077                                cgroup_context,
6078                                socket_fd,
6079                                named_iofds,
6080                                fds,
6081                                n_socket_fds,
6082                                n_storage_fds,
6083                                files_env,
6084                                unit->manager->user_lookup_fds[1],
6085                                &exit_status);
6086
6087                 if (r < 0) {
6088                         const char *status = ASSERT_PTR(
6089                                         exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
6090
6091                         log_unit_struct_errno(unit, LOG_ERR, r,
6092                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6093                                               LOG_UNIT_INVOCATION_ID(unit),
6094                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6095                                                                status, command->path),
6096                                               "EXECUTABLE=%s", command->path);
6097                 } else
6098                         assert(exit_status == EXIT_SUCCESS);
6099
6100                 _exit(exit_status);
6101         }
6102
6103         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6104
6105         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6106          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6107          * process will be killed too). */
6108         if (subcgroup_path)
6109                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6110
6111         exec_status_start(&command->exec_status, pid);
6112
6113         *ret = pid;
6114         return 0;
6115 }
6116
6117 void exec_context_init(ExecContext *c) {
6118         assert(c);
6119
6120         c->umask = 0022;
6121         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6122         c->cpu_sched_policy = SCHED_OTHER;
6123         c->syslog_priority = LOG_DAEMON|LOG_INFO;
6124         c->syslog_level_prefix = true;
6125         c->ignore_sigpipe = true;
6126         c->timer_slack_nsec = NSEC_INFINITY;
6127         c->personality = PERSONALITY_INVALID;
6128         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6129                 c->directories[t].mode = 0755;
6130         c->timeout_clean_usec = USEC_INFINITY;
6131         c->capability_bounding_set = CAP_MASK_UNSET;
6132         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6133         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6134         c->log_level_max = -1;
6135 #if HAVE_SECCOMP
6136         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6137 #endif
6138         c->tty_rows = UINT_MAX;
6139         c->tty_cols = UINT_MAX;
6140         numa_policy_reset(&c->numa_policy);
6141         c->private_mounts = -1;
6142         c->memory_ksm = -1;
6143 }
6144
6145 void exec_context_done(ExecContext *c) {
6146         assert(c);
6147
6148         c->environment = strv_free(c->environment);
6149         c->environment_files = strv_free(c->environment_files);
6150         c->pass_environment = strv_free(c->pass_environment);
6151         c->unset_environment = strv_free(c->unset_environment);
6152
6153         rlimit_free_all(c->rlimit);
6154
6155         for (size_t l = 0; l < 3; l++) {
6156                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6157                 c->stdio_file[l] = mfree(c->stdio_file[l]);
6158         }
6159
6160         c->working_directory = mfree(c->working_directory);
6161         c->root_directory = mfree(c->root_directory);
6162         c->root_image = mfree(c->root_image);
6163         c->root_image_options = mount_options_free_all(c->root_image_options);
6164         c->root_hash = mfree(c->root_hash);
6165         c->root_hash_size = 0;
6166         c->root_hash_path = mfree(c->root_hash_path);
6167         c->root_hash_sig = mfree(c->root_hash_sig);
6168         c->root_hash_sig_size = 0;
6169         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6170         c->root_verity = mfree(c->root_verity);
6171         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6172         c->extension_directories = strv_free(c->extension_directories);
6173         c->tty_path = mfree(c->tty_path);
6174         c->syslog_identifier = mfree(c->syslog_identifier);
6175         c->user = mfree(c->user);
6176         c->group = mfree(c->group);
6177
6178         c->supplementary_groups = strv_free(c->supplementary_groups);
6179
6180         c->pam_name = mfree(c->pam_name);
6181
6182         c->read_only_paths = strv_free(c->read_only_paths);
6183         c->read_write_paths = strv_free(c->read_write_paths);
6184         c->inaccessible_paths = strv_free(c->inaccessible_paths);
6185         c->exec_paths = strv_free(c->exec_paths);
6186         c->no_exec_paths = strv_free(c->no_exec_paths);
6187         c->exec_search_path = strv_free(c->exec_search_path);
6188
6189         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6190         c->bind_mounts = NULL;
6191         c->n_bind_mounts = 0;
6192         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6193         c->temporary_filesystems = NULL;
6194         c->n_temporary_filesystems = 0;
6195         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6196
6197         cpu_set_reset(&c->cpu_set);
6198         numa_policy_reset(&c->numa_policy);
6199
6200         c->utmp_id = mfree(c->utmp_id);
6201         c->selinux_context = mfree(c->selinux_context);
6202         c->apparmor_profile = mfree(c->apparmor_profile);
6203         c->smack_process_label = mfree(c->smack_process_label);
6204
6205         c->restrict_filesystems = set_free_free(c->restrict_filesystems);
6206
6207         c->syscall_filter = hashmap_free(c->syscall_filter);
6208         c->syscall_archs = set_free(c->syscall_archs);
6209         c->address_families = set_free(c->address_families);
6210
6211         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6212                 exec_directory_done(&c->directories[t]);
6213
6214         c->log_level_max = -1;
6215
6216         exec_context_free_log_extra_fields(c);
6217         c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
6218         c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
6219
6220         c->log_ratelimit_interval_usec = 0;
6221         c->log_ratelimit_burst = 0;
6222
6223         c->stdin_data = mfree(c->stdin_data);
6224         c->stdin_data_size = 0;
6225
6226         c->network_namespace_path = mfree(c->network_namespace_path);
6227         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6228
6229         c->log_namespace = mfree(c->log_namespace);
6230
6231         c->load_credentials = hashmap_free(c->load_credentials);
6232         c->set_credentials = hashmap_free(c->set_credentials);
6233         c->import_credentials = set_free_free(c->import_credentials);
6234
6235         c->root_image_policy = image_policy_free(c->root_image_policy);
6236         c->mount_image_policy = image_policy_free(c->mount_image_policy);
6237         c->extension_image_policy = image_policy_free(c->extension_image_policy);
6238 }
6239
6240 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6241         assert(c);
6242
6243         if (!runtime_prefix)
6244                 return 0;
6245
6246         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6247                 _cleanup_free_ char *p = NULL;
6248
6249                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6250                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6251                 else
6252                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6253                 if (!p)
6254                         return -ENOMEM;
6255
6256                 /* We execute this synchronously, since we need to be sure this is gone when we start the
6257                  * service next. */
6258                 (void) rm_rf(p, REMOVE_ROOT);
6259
6260                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6261                         _cleanup_free_ char *symlink_abs = NULL;
6262
6263                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6264                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6265                         else
6266                                 symlink_abs = path_join(runtime_prefix, *symlink);
6267                         if (!symlink_abs)
6268                                 return -ENOMEM;
6269
6270                         (void) unlink(symlink_abs);
6271                 }
6272         }
6273
6274         return 0;
6275 }
6276
6277 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6278         _cleanup_free_ char *p = NULL;
6279
6280         assert(c);
6281
6282         if (!runtime_prefix || !unit)
6283                 return 0;
6284
6285         p = path_join(runtime_prefix, "credentials", unit);
6286         if (!p)
6287                 return -ENOMEM;
6288
6289         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6290          * unmount it, and afterwards remove the mount point */
6291         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6292         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6293
6294         return 0;
6295 }
6296
6297 int exec_context_destroy_mount_ns_dir(Unit *u) {
6298         _cleanup_free_ char *p = NULL;
6299
6300         if (!u || !MANAGER_IS_SYSTEM(u->manager))
6301                 return 0;
6302
6303         p = path_join("/run/systemd/propagate/", u->id);
6304         if (!p)
6305                 return -ENOMEM;
6306
6307         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6308         if (rmdir(p) < 0 && errno != ENOENT)
6309                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6310
6311         return 0;
6312 }
6313
6314 static void exec_command_done(ExecCommand *c) {
6315         assert(c);
6316
6317         c->path = mfree(c->path);
6318         c->argv = strv_free(c->argv);
6319 }
6320
6321 void exec_command_done_array(ExecCommand *c, size_t n) {
6322         for (size_t i = 0; i < n; i++)
6323                 exec_command_done(c+i);
6324 }
6325
6326 ExecCommand* exec_command_free_list(ExecCommand *c) {
6327         ExecCommand *i;
6328
6329         while ((i = c)) {
6330                 LIST_REMOVE(command, c, i);
6331                 exec_command_done(i);
6332                 free(i);
6333         }
6334
6335         return NULL;
6336 }
6337
6338 void exec_command_free_array(ExecCommand **c, size_t n) {
6339         for (size_t i = 0; i < n; i++)
6340                 c[i] = exec_command_free_list(c[i]);
6341 }
6342
6343 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6344         for (size_t i = 0; i < n; i++)
6345                 exec_status_reset(&c[i].exec_status);
6346 }
6347
6348 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6349         for (size_t i = 0; i < n; i++)
6350                 LIST_FOREACH(command, z, c[i])
6351                         exec_status_reset(&z->exec_status);
6352 }
6353
6354 typedef struct InvalidEnvInfo {
6355         const Unit *unit;
6356         const char *path;
6357 } InvalidEnvInfo;
6358
6359 static void invalid_env(const char *p, void *userdata) {
6360         InvalidEnvInfo *info = userdata;
6361
6362         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6363 }
6364
6365 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6366         assert(c);
6367
6368         switch (fd_index) {
6369
6370         case STDIN_FILENO:
6371                 if (c->std_input != EXEC_INPUT_NAMED_FD)
6372                         return NULL;
6373
6374                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6375
6376         case STDOUT_FILENO:
6377                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6378                         return NULL;
6379
6380                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6381
6382         case STDERR_FILENO:
6383                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6384                         return NULL;
6385
6386                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6387
6388         default:
6389                 return NULL;
6390         }
6391 }
6392
6393 static int exec_context_named_iofds(
6394                 const ExecContext *c,
6395                 const ExecParameters *p,
6396                 int named_iofds[static 3]) {
6397
6398         size_t targets;
6399         const char* stdio_fdname[3];
6400         size_t n_fds;
6401
6402         assert(c);
6403         assert(p);
6404         assert(named_iofds);
6405
6406         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6407                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6408                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
6409
6410         for (size_t i = 0; i < 3; i++)
6411                 stdio_fdname[i] = exec_context_fdname(c, i);
6412
6413         n_fds = p->n_storage_fds + p->n_socket_fds;
6414
6415         for (size_t i = 0; i < n_fds  && targets > 0; i++)
6416                 if (named_iofds[STDIN_FILENO] < 0 &&
6417                     c->std_input == EXEC_INPUT_NAMED_FD &&
6418                     stdio_fdname[STDIN_FILENO] &&
6419                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6420
6421                         named_iofds[STDIN_FILENO] = p->fds[i];
6422                         targets--;
6423
6424                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6425                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
6426                            stdio_fdname[STDOUT_FILENO] &&
6427                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6428
6429                         named_iofds[STDOUT_FILENO] = p->fds[i];
6430                         targets--;
6431
6432                 } else if (named_iofds[STDERR_FILENO] < 0 &&
6433                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
6434                            stdio_fdname[STDERR_FILENO] &&
6435                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6436
6437                         named_iofds[STDERR_FILENO] = p->fds[i];
6438                         targets--;
6439                 }
6440
6441         return targets == 0 ? 0 : -ENOENT;
6442 }
6443
6444 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6445         _cleanup_strv_free_ char **v = NULL;
6446         int r;
6447
6448         assert(c);
6449         assert(ret);
6450
6451         STRV_FOREACH(i, c->environment_files) {
6452                 _cleanup_globfree_ glob_t pglob = {};
6453                 bool ignore = false;
6454                 char *fn = *i;
6455
6456                 if (fn[0] == '-') {
6457                         ignore = true;
6458                         fn++;
6459                 }
6460
6461                 if (!path_is_absolute(fn)) {
6462                         if (ignore)
6463                                 continue;
6464                         return -EINVAL;
6465                 }
6466
6467                 /* Filename supports globbing, take all matching files */
6468                 r = safe_glob(fn, 0, &pglob);
6469                 if (r < 0) {
6470                         if (ignore)
6471                                 continue;
6472                         return r;
6473                 }
6474
6475                 /* When we don't match anything, -ENOENT should be returned */
6476                 assert(pglob.gl_pathc > 0);
6477
6478                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6479                         _cleanup_strv_free_ char **p = NULL;
6480
6481                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6482                         if (r < 0) {
6483                                 if (ignore)
6484                                         continue;
6485                                 return r;
6486                         }
6487
6488                         /* Log invalid environment variables with filename */
6489                         if (p) {
6490                                 InvalidEnvInfo info = {
6491                                         .unit = unit,
6492                                         .path = pglob.gl_pathv[n]
6493                                 };
6494
6495                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
6496                         }
6497
6498                         if (!v)
6499                                 v = TAKE_PTR(p);
6500                         else {
6501                                 char **m = strv_env_merge(v, p);
6502                                 if (!m)
6503                                         return -ENOMEM;
6504
6505                                 strv_free_and_replace(v, m);
6506                         }
6507                 }
6508         }
6509
6510         *ret = TAKE_PTR(v);
6511
6512         return 0;
6513 }
6514
6515 static bool tty_may_match_dev_console(const char *tty) {
6516         _cleanup_free_ char *resolved = NULL;
6517
6518         if (!tty)
6519                 return true;
6520
6521         tty = skip_dev_prefix(tty);
6522
6523         /* trivial identity? */
6524         if (streq(tty, "console"))
6525                 return true;
6526
6527         if (resolve_dev_console(&resolved) < 0)
6528                 return true; /* if we could not resolve, assume it may */
6529
6530         /* "tty0" means the active VC, so it may be the same sometimes */
6531         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6532 }
6533
6534 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6535         assert(ec);
6536
6537         return ec->tty_reset ||
6538                 ec->tty_vhangup ||
6539                 ec->tty_vt_disallocate ||
6540                 is_terminal_input(ec->std_input) ||
6541                 is_terminal_output(ec->std_output) ||
6542                 is_terminal_output(ec->std_error);
6543 }
6544
6545 bool exec_context_may_touch_console(const ExecContext *ec) {
6546
6547         return exec_context_may_touch_tty(ec) &&
6548                tty_may_match_dev_console(exec_context_tty_path(ec));
6549 }
6550
6551 static void strv_fprintf(FILE *f, char **l) {
6552         assert(f);
6553
6554         STRV_FOREACH(g, l)
6555                 fprintf(f, " %s", *g);
6556 }
6557
/* Writes one "<prefix><name>: item item ..." line for 'strv' to 'f'. Writes nothing at all if the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
6569
6570 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6571         int r;
6572
6573         assert(c);
6574         assert(f);
6575
6576         prefix = strempty(prefix);
6577
6578         fprintf(f,
6579                 "%sUMask: %04o\n"
6580                 "%sWorkingDirectory: %s\n"
6581                 "%sRootDirectory: %s\n"
6582                 "%sRootEphemeral: %s\n"
6583                 "%sNonBlocking: %s\n"
6584                 "%sPrivateTmp: %s\n"
6585                 "%sPrivateDevices: %s\n"
6586                 "%sProtectKernelTunables: %s\n"
6587                 "%sProtectKernelModules: %s\n"
6588                 "%sProtectKernelLogs: %s\n"
6589                 "%sProtectClock: %s\n"
6590                 "%sProtectControlGroups: %s\n"
6591                 "%sPrivateNetwork: %s\n"
6592                 "%sPrivateUsers: %s\n"
6593                 "%sProtectHome: %s\n"
6594                 "%sProtectSystem: %s\n"
6595                 "%sMountAPIVFS: %s\n"
6596                 "%sIgnoreSIGPIPE: %s\n"
6597                 "%sMemoryDenyWriteExecute: %s\n"
6598                 "%sRestrictRealtime: %s\n"
6599                 "%sRestrictSUIDSGID: %s\n"
6600                 "%sKeyringMode: %s\n"
6601                 "%sProtectHostname: %s\n"
6602                 "%sProtectProc: %s\n"
6603                 "%sProcSubset: %s\n",
6604                 prefix, c->umask,
6605                 prefix, empty_to_root(c->working_directory),
6606                 prefix, empty_to_root(c->root_directory),
6607                 prefix, yes_no(c->root_ephemeral),
6608                 prefix, yes_no(c->non_blocking),
6609                 prefix, yes_no(c->private_tmp),
6610                 prefix, yes_no(c->private_devices),
6611                 prefix, yes_no(c->protect_kernel_tunables),
6612                 prefix, yes_no(c->protect_kernel_modules),
6613                 prefix, yes_no(c->protect_kernel_logs),
6614                 prefix, yes_no(c->protect_clock),
6615                 prefix, yes_no(c->protect_control_groups),
6616                 prefix, yes_no(c->private_network),
6617                 prefix, yes_no(c->private_users),
6618                 prefix, protect_home_to_string(c->protect_home),
6619                 prefix, protect_system_to_string(c->protect_system),
6620                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6621                 prefix, yes_no(c->ignore_sigpipe),
6622                 prefix, yes_no(c->memory_deny_write_execute),
6623                 prefix, yes_no(c->restrict_realtime),
6624                 prefix, yes_no(c->restrict_suid_sgid),
6625                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6626                 prefix, yes_no(c->protect_hostname),
6627                 prefix, protect_proc_to_string(c->protect_proc),
6628                 prefix, proc_subset_to_string(c->proc_subset));
6629
6630         if (c->root_image)
6631                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6632
6633         if (c->root_image_options) {
6634                 fprintf(f, "%sRootImageOptions:", prefix);
6635                 LIST_FOREACH(mount_options, o, c->root_image_options)
6636                         if (!isempty(o->options))
6637                                 fprintf(f, " %s:%s",
6638                                         partition_designator_to_string(o->partition_designator),
6639                                         o->options);
6640                 fprintf(f, "\n");
6641         }
6642
6643         if (c->root_hash) {
6644                 _cleanup_free_ char *encoded = NULL;
6645                 encoded = hexmem(c->root_hash, c->root_hash_size);
6646                 if (encoded)
6647                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6648         }
6649
6650         if (c->root_hash_path)
6651                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6652
6653         if (c->root_hash_sig) {
6654                 _cleanup_free_ char *encoded = NULL;
6655                 ssize_t len;
6656                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6657                 if (len)
6658                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6659         }
6660
6661         if (c->root_hash_sig_path)
6662                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6663
6664         if (c->root_verity)
6665                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6666
6667         STRV_FOREACH(e, c->environment)
6668                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6669
6670         STRV_FOREACH(e, c->environment_files)
6671                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6672
6673         STRV_FOREACH(e, c->pass_environment)
6674                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6675
6676         STRV_FOREACH(e, c->unset_environment)
6677                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6678
6679         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6680
6681         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6682                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6683
6684                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6685                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6686
6687                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6688                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6689                 }
6690         }
6691
6692         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6693
6694         if (c->nice_set)
6695                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6696
6697         if (c->oom_score_adjust_set)
6698                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6699
6700         if (c->coredump_filter_set)
6701                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6702
6703         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6704                 if (c->rlimit[i]) {
6705                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6706                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6707                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6708                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6709                 }
6710
6711         if (c->ioprio_set) {
6712                 _cleanup_free_ char *class_str = NULL;
6713
6714                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6715                 if (r >= 0)
6716                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6717
6718                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6719         }
6720
6721         if (c->cpu_sched_set) {
6722                 _cleanup_free_ char *policy_str = NULL;
6723
6724                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6725                 if (r >= 0)
6726                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6727
6728                 fprintf(f,
6729                         "%sCPUSchedulingPriority: %i\n"
6730                         "%sCPUSchedulingResetOnFork: %s\n",
6731                         prefix, c->cpu_sched_priority,
6732                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6733         }
6734
6735         if (c->cpu_set.set) {
6736                 _cleanup_free_ char *affinity = NULL;
6737
6738                 affinity = cpu_set_to_range_string(&c->cpu_set);
6739                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6740         }
6741
6742         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6743                 _cleanup_free_ char *nodes = NULL;
6744
6745                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6746                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6747                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6748         }
6749
6750         if (c->timer_slack_nsec != NSEC_INFINITY)
6751                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6752
6753         fprintf(f,
6754                 "%sStandardInput: %s\n"
6755                 "%sStandardOutput: %s\n"
6756                 "%sStandardError: %s\n",
6757                 prefix, exec_input_to_string(c->std_input),
6758                 prefix, exec_output_to_string(c->std_output),
6759                 prefix, exec_output_to_string(c->std_error));
6760
6761         if (c->std_input == EXEC_INPUT_NAMED_FD)
6762                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6763         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6764                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6765         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6766                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6767
6768         if (c->std_input == EXEC_INPUT_FILE)
6769                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6770         if (c->std_output == EXEC_OUTPUT_FILE)
6771                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6772         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6773                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6774         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6775                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6776         if (c->std_error == EXEC_OUTPUT_FILE)
6777                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6778         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6779                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6780         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6781                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6782
6783         if (c->tty_path)
6784                 fprintf(f,
6785                         "%sTTYPath: %s\n"
6786                         "%sTTYReset: %s\n"
6787                         "%sTTYVHangup: %s\n"
6788                         "%sTTYVTDisallocate: %s\n"
6789                         "%sTTYRows: %u\n"
6790                         "%sTTYColumns: %u\n",
6791                         prefix, c->tty_path,
6792                         prefix, yes_no(c->tty_reset),
6793                         prefix, yes_no(c->tty_vhangup),
6794                         prefix, yes_no(c->tty_vt_disallocate),
6795                         prefix, c->tty_rows,
6796                         prefix, c->tty_cols);
6797
6798         if (IN_SET(c->std_output,
6799                    EXEC_OUTPUT_KMSG,
6800                    EXEC_OUTPUT_JOURNAL,
6801                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6802                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6803             IN_SET(c->std_error,
6804                    EXEC_OUTPUT_KMSG,
6805                    EXEC_OUTPUT_JOURNAL,
6806                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6807                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6808
6809                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6810
6811                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6812                 if (r >= 0)
6813                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6814
6815                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6816                 if (r >= 0)
6817                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6818         }
6819
6820         if (c->log_level_max >= 0) {
6821                 _cleanup_free_ char *t = NULL;
6822
6823                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6824
6825                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6826         }
6827
6828         if (c->log_ratelimit_interval_usec > 0)
6829                 fprintf(f,
6830                         "%sLogRateLimitIntervalSec: %s\n",
6831                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6832
6833         if (c->log_ratelimit_burst > 0)
6834                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6835
6836         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6837                 fprintf(f, "%sLogFilterPatterns:", prefix);
6838
6839                 char *pattern;
6840                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6841                         fprintf(f, " %s", pattern);
6842                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6843                         fprintf(f, " ~%s", pattern);
6844                 fputc('\n', f);
6845         }
6846
6847         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6848                 fprintf(f, "%sLogExtraFields: ", prefix);
6849                 fwrite(c->log_extra_fields[j].iov_base,
6850                        1, c->log_extra_fields[j].iov_len,
6851                        f);
6852                 fputc('\n', f);
6853         }
6854
6855         if (c->log_namespace)
6856                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6857
6858         if (c->secure_bits) {
6859                 _cleanup_free_ char *str = NULL;
6860
6861                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6862                 if (r >= 0)
6863                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6864         }
6865
6866         if (c->capability_bounding_set != CAP_MASK_UNSET) {
6867                 _cleanup_free_ char *str = NULL;
6868
6869                 r = capability_set_to_string(c->capability_bounding_set, &str);
6870                 if (r >= 0)
6871                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6872         }
6873
6874         if (c->capability_ambient_set != 0) {
6875                 _cleanup_free_ char *str = NULL;
6876
6877                 r = capability_set_to_string(c->capability_ambient_set, &str);
6878                 if (r >= 0)
6879                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6880         }
6881
6882         if (c->user)
6883                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6884         if (c->group)
6885                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6886
6887         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6888
6889         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6890
6891         if (c->pam_name)
6892                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6893
6894         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6895         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6896         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6897         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6898         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6899         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6900
6901         for (size_t i = 0; i < c->n_bind_mounts; i++)
6902                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6903                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6904                         c->bind_mounts[i].ignore_enoent ? "-": "",
6905                         c->bind_mounts[i].source,
6906                         c->bind_mounts[i].destination,
6907                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6908
6909         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6910                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6911
6912                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6913                         t->path,
6914                         isempty(t->options) ? "" : ":",
6915                         strempty(t->options));
6916         }
6917
6918         if (c->utmp_id)
6919                 fprintf(f,
6920                         "%sUtmpIdentifier: %s\n",
6921                         prefix, c->utmp_id);
6922
6923         if (c->selinux_context)
6924                 fprintf(f,
6925                         "%sSELinuxContext: %s%s\n",
6926                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6927
6928         if (c->apparmor_profile)
6929                 fprintf(f,
6930                         "%sAppArmorProfile: %s%s\n",
6931                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6932
6933         if (c->smack_process_label)
6934                 fprintf(f,
6935                         "%sSmackProcessLabel: %s%s\n",
6936                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6937
6938         if (c->personality != PERSONALITY_INVALID)
6939                 fprintf(f,
6940                         "%sPersonality: %s\n",
6941                         prefix, strna(personality_to_string(c->personality)));
6942
6943         fprintf(f,
6944                 "%sLockPersonality: %s\n",
6945                 prefix, yes_no(c->lock_personality));
6946
6947         if (c->syscall_filter) {
6948                 fprintf(f,
6949                         "%sSystemCallFilter: ",
6950                         prefix);
6951
6952                 if (!c->syscall_allow_list)
6953                         fputc('~', f);
6954
6955 #if HAVE_SECCOMP
6956                 void *id, *val;
6957                 bool first = true;
6958                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6959                         _cleanup_free_ char *name = NULL;
6960                         const char *errno_name = NULL;
6961                         int num = PTR_TO_INT(val);
6962
6963                         if (first)
6964                                 first = false;
6965                         else
6966                                 fputc(' ', f);
6967
6968                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6969                         fputs(strna(name), f);
6970
6971                         if (num >= 0) {
6972                                 errno_name = seccomp_errno_or_action_to_string(num);
6973                                 if (errno_name)
6974                                         fprintf(f, ":%s", errno_name);
6975                                 else
6976                                         fprintf(f, ":%d", num);
6977                         }
6978                 }
6979 #endif
6980
6981                 fputc('\n', f);
6982         }
6983
6984         if (c->syscall_archs) {
6985                 fprintf(f,
6986                         "%sSystemCallArchitectures:",
6987                         prefix);
6988
6989 #if HAVE_SECCOMP
6990                 void *id;
6991                 SET_FOREACH(id, c->syscall_archs)
6992                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6993 #endif
6994                 fputc('\n', f);
6995         }
6996
6997         if (exec_context_restrict_namespaces_set(c)) {
6998                 _cleanup_free_ char *s = NULL;
6999
7000                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
7001                 if (r >= 0)
7002                         fprintf(f, "%sRestrictNamespaces: %s\n",
7003                                 prefix, strna(s));
7004         }
7005
7006 #if HAVE_LIBBPF
7007         if (exec_context_restrict_filesystems_set(c)) {
7008                 char *fs;
7009                 SET_FOREACH(fs, c->restrict_filesystems)
7010                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
7011         }
7012 #endif
7013
7014         if (c->network_namespace_path)
7015                 fprintf(f,
7016                         "%sNetworkNamespacePath: %s\n",
7017                         prefix, c->network_namespace_path);
7018
7019         if (c->syscall_errno > 0) {
7020                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
7021
7022 #if HAVE_SECCOMP
7023                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
7024                 if (errno_name)
7025                         fputs(errno_name, f);
7026                 else
7027                         fprintf(f, "%d", c->syscall_errno);
7028 #endif
7029                 fputc('\n', f);
7030         }
7031
7032         for (size_t i = 0; i < c->n_mount_images; i++) {
7033                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
7034                         c->mount_images[i].ignore_enoent ? "-": "",
7035                         c->mount_images[i].source,
7036                         c->mount_images[i].destination);
7037                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7038                         fprintf(f, ":%s:%s",
7039                                 partition_designator_to_string(o->partition_designator),
7040                                 strempty(o->options));
7041                 fprintf(f, "\n");
7042         }
7043
7044         for (size_t i = 0; i < c->n_extension_images; i++) {
7045                 fprintf(f, "%sExtensionImages: %s%s", prefix,
7046                         c->extension_images[i].ignore_enoent ? "-": "",
7047                         c->extension_images[i].source);
7048                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7049                         fprintf(f, ":%s:%s",
7050                                 partition_designator_to_string(o->partition_designator),
7051                                 strempty(o->options));
7052                 fprintf(f, "\n");
7053         }
7054
7055         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7056 }
7057
7058 bool exec_context_maintains_privileges(const ExecContext *c) {
7059         assert(c);
7060
7061         /* Returns true if the process forked off would run under
7062          * an unchanged UID or as root. */
7063
7064         if (!c->user)
7065                 return true;
7066
7067         if (streq(c->user, "root") || streq(c->user, "0"))
7068                 return true;
7069
7070         return false;
7071 }
7072
7073 int exec_context_get_effective_ioprio(const ExecContext *c) {
7074         int p;
7075
7076         assert(c);
7077
7078         if (c->ioprio_set)
7079                 return c->ioprio;
7080
7081         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7082         if (p < 0)
7083                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7084
7085         return ioprio_normalize(p);
7086 }
7087
7088 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7089         assert(c);
7090
7091         /* Explicit setting wins */
7092         if (c->mount_apivfs_set)
7093                 return c->mount_apivfs;
7094
7095         /* Default to "yes" if root directory or image are specified */
7096         if (exec_context_with_rootfs(c))
7097                 return true;
7098
7099         return false;
7100 }
7101
7102 void exec_context_free_log_extra_fields(ExecContext *c) {
7103         assert(c);
7104
7105         for (size_t l = 0; l < c->n_log_extra_fields; l++)
7106                 free(c->log_extra_fields[l].iov_base);
7107         c->log_extra_fields = mfree(c->log_extra_fields);
7108         c->n_log_extra_fields = 0;
7109 }
7110
7111 void exec_context_revert_tty(ExecContext *c) {
7112         _cleanup_close_ int fd = -EBADF;
7113         const char *path;
7114         struct stat st;
7115         int r;
7116
7117         assert(c);
7118
7119         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7120         exec_context_tty_reset(c, NULL);
7121
7122         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7123          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7124          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7125         if (!exec_context_may_touch_tty(c))
7126                 return;
7127
7128         path = exec_context_tty_path(c);
7129         if (!path)
7130                 return;
7131
7132         fd = open(path, O_PATH|O_CLOEXEC);
7133         if (fd < 0)
7134                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7135                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7136                                              path);
7137
7138         if (fstat(fd, &st) < 0)
7139                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7140
7141         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7142          * if things are a character device, since a proper check either means we'd have to open the TTY and
7143          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7144          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7145          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7146         if (!S_ISCHR(st.st_mode))
7147                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7148
7149         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7150         if (r < 0)
7151                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7152 }
7153
7154 int exec_context_get_clean_directories(
7155                 ExecContext *c,
7156                 char **prefix,
7157                 ExecCleanMask mask,
7158                 char ***ret) {
7159
7160         _cleanup_strv_free_ char **l = NULL;
7161         int r;
7162
7163         assert(c);
7164         assert(prefix);
7165         assert(ret);
7166
7167         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7168                 if (!FLAGS_SET(mask, 1U << t))
7169                         continue;
7170
7171                 if (!prefix[t])
7172                         continue;
7173
7174                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7175                         char *j;
7176
7177                         j = path_join(prefix[t], c->directories[t].items[i].path);
7178                         if (!j)
7179                                 return -ENOMEM;
7180
7181                         r = strv_consume(&l, j);
7182                         if (r < 0)
7183                                 return r;
7184
7185                         /* Also remove private directories unconditionally. */
7186                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
7187                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7188                                 if (!j)
7189                                         return -ENOMEM;
7190
7191                                 r = strv_consume(&l, j);
7192                                 if (r < 0)
7193                                         return r;
7194                         }
7195
7196                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7197                                 j = path_join(prefix[t], *symlink);
7198                                 if (!j)
7199                                         return -ENOMEM;
7200
7201                                 r = strv_consume(&l, j);
7202                                 if (r < 0)
7203                                         return r;
7204                         }
7205                 }
7206         }
7207
7208         *ret = TAKE_PTR(l);
7209         return 0;
7210 }
7211
7212 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7213         ExecCleanMask mask = 0;
7214
7215         assert(c);
7216         assert(ret);
7217
7218         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7219                 if (c->directories[t].n_items > 0)
7220                         mask |= 1U << t;
7221
7222         *ret = mask;
7223         return 0;
7224 }
7225
7226 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7227         ExecLoadCredential *load_cred;
7228         ExecSetCredential *set_cred;
7229
7230         assert(c);
7231
7232         HASHMAP_FOREACH(load_cred, c->load_credentials)
7233                 if (load_cred->encrypted)
7234                         return true;
7235
7236         HASHMAP_FOREACH(set_cred, c->set_credentials)
7237                 if (set_cred->encrypted)
7238                         return true;
7239
7240         return false;
7241 }
7242
7243 void exec_status_start(ExecStatus *s, pid_t pid) {
7244         assert(s);
7245
7246         *s = (ExecStatus) {
7247                 .pid = pid,
7248         };
7249
7250         dual_timestamp_get(&s->start_timestamp);
7251 }
7252
7253 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7254         assert(s);
7255
7256         if (s->pid != pid)
7257                 *s = (ExecStatus) {
7258                         .pid = pid,
7259                 };
7260
7261         dual_timestamp_get(&s->exit_timestamp);
7262
7263         s->code = code;
7264         s->status = status;
7265
7266         if (context && context->utmp_id)
7267                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7268 }
7269
7270 void exec_status_reset(ExecStatus *s) {
7271         assert(s);
7272
7273         *s = (ExecStatus) {};
7274 }
7275
7276 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7277         assert(s);
7278         assert(f);
7279
7280         if (s->pid <= 0)
7281                 return;
7282
7283         prefix = strempty(prefix);
7284
7285         fprintf(f,
7286                 "%sPID: "PID_FMT"\n",
7287                 prefix, s->pid);
7288
7289         if (dual_timestamp_is_set(&s->start_timestamp))
7290                 fprintf(f,
7291                         "%sStart Timestamp: %s\n",
7292                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7293
7294         if (dual_timestamp_is_set(&s->exit_timestamp))
7295                 fprintf(f,
7296                         "%sExit Timestamp: %s\n"
7297                         "%sExit Code: %s\n"
7298                         "%sExit Status: %i\n",
7299                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7300                         prefix, sigchld_code_to_string(s->code),
7301                         prefix, s->status);
7302 }
7303
7304 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7305         _cleanup_free_ char *cmd = NULL;
7306         const char *prefix2;
7307
7308         assert(c);
7309         assert(f);
7310
7311         prefix = strempty(prefix);
7312         prefix2 = strjoina(prefix, "\t");
7313
7314         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7315
7316         fprintf(f,
7317                 "%sCommand Line: %s\n",
7318                 prefix, strnull(cmd));
7319
7320         exec_status_dump(&c->exec_status, f, prefix2);
7321 }
7322
7323 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7324         assert(f);
7325
7326         prefix = strempty(prefix);
7327
7328         LIST_FOREACH(command, i, c)
7329                 exec_command_dump(i, f, prefix);
7330 }
7331
7332 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7333         ExecCommand *end;
7334
7335         assert(l);
7336         assert(e);
7337
7338         if (*l) {
7339                 /* It's kind of important, that we keep the order here */
7340                 end = LIST_FIND_TAIL(command, *l);
7341                 LIST_INSERT_AFTER(command, *l, end, e);
7342         } else
7343                 *l = e;
7344 }
7345
7346 int exec_command_set(ExecCommand *c, const char *path, ...) {
7347         va_list ap;
7348         char **l, *p;
7349
7350         assert(c);
7351         assert(path);
7352
7353         va_start(ap, path);
7354         l = strv_new_ap(path, ap);
7355         va_end(ap);
7356
7357         if (!l)
7358                 return -ENOMEM;
7359
7360         p = strdup(path);
7361         if (!p) {
7362                 strv_free(l);
7363                 return -ENOMEM;
7364         }
7365
7366         free_and_replace(c->path, p);
7367
7368         return strv_free_and_replace(c->argv, l);
7369 }
7370
7371 int exec_command_append(ExecCommand *c, const char *path, ...) {
7372         _cleanup_strv_free_ char **l = NULL;
7373         va_list ap;
7374         int r;
7375
7376         assert(c);
7377         assert(path);
7378
7379         va_start(ap, path);
7380         l = strv_new_ap(path, ap);
7381         va_end(ap);
7382
7383         if (!l)
7384                 return -ENOMEM;
7385
7386         r = strv_extend_strv(&c->argv, l, false);
7387         if (r < 0)
7388                 return r;
7389
7390         return 0;
7391 }
7392
7393 static char *destroy_tree(char *path) {
7394         if (!path)
7395                 return NULL;
7396
7397         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7398                 log_debug("Spawning process to nuke '%s'", path);
7399
7400                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7401         }
7402
7403         return mfree(path);
7404 }
7405
7406 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7407         if (!rt)
7408                 return NULL;
7409
7410         if (rt->manager)
7411                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7412
7413         rt->id = mfree(rt->id);
7414         rt->tmp_dir = mfree(rt->tmp_dir);
7415         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7416         safe_close_pair(rt->netns_storage_socket);
7417         safe_close_pair(rt->ipcns_storage_socket);
7418         return mfree(rt);
7419 }
7420
/* Generate the helpers built on top of exec_shared_runtime_free(): exec_shared_runtime_unref(), which is
 * called by exec_runtime_free(), and exec_shared_runtime_freep(), which is used with _cleanup_() in this
 * file. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7423
7424 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7425         if (!rt)
7426                 return NULL;
7427
7428         assert(rt->n_ref > 0);
7429         rt->n_ref--;
7430
7431         if (rt->n_ref > 0)
7432                 return NULL;
7433
7434         rt->tmp_dir = destroy_tree(rt->tmp_dir);
7435         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7436
7437         return exec_shared_runtime_free(rt);
7438 }
7439
7440 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7441         _cleanup_free_ char *id_copy = NULL;
7442         ExecSharedRuntime *n;
7443
7444         assert(ret);
7445
7446         id_copy = strdup(id);
7447         if (!id_copy)
7448                 return -ENOMEM;
7449
7450         n = new(ExecSharedRuntime, 1);
7451         if (!n)
7452                 return -ENOMEM;
7453
7454         *n = (ExecSharedRuntime) {
7455                 .id = TAKE_PTR(id_copy),
7456                 .netns_storage_socket = PIPE_EBADF,
7457                 .ipcns_storage_socket = PIPE_EBADF,
7458         };
7459
7460         *ret = n;
7461         return 0;
7462 }
7463
7464 static int exec_shared_runtime_add(
7465                 Manager *m,
7466                 const char *id,
7467                 char **tmp_dir,
7468                 char **var_tmp_dir,
7469                 int netns_storage_socket[2],
7470                 int ipcns_storage_socket[2],
7471                 ExecSharedRuntime **ret) {
7472
7473         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7474         int r;
7475
7476         assert(m);
7477         assert(id);
7478
7479         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7480
7481         r = exec_shared_runtime_allocate(&rt, id);
7482         if (r < 0)
7483                 return r;
7484
7485         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7486         if (r < 0)
7487                 return r;
7488
7489         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7490         rt->tmp_dir = TAKE_PTR(*tmp_dir);
7491         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7492
7493         if (netns_storage_socket) {
7494                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7495                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7496         }
7497
7498         if (ipcns_storage_socket) {
7499                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7500                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7501         }
7502
7503         rt->manager = m;
7504
7505         if (ret)
7506                 *ret = rt;
7507         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7508         TAKE_PTR(rt);
7509         return 0;
7510 }
7511
7512 static int exec_shared_runtime_make(
7513                 Manager *m,
7514                 const ExecContext *c,
7515                 const char *id,
7516                 ExecSharedRuntime **ret) {
7517
7518         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7519         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7520         int r;
7521
7522         assert(m);
7523         assert(c);
7524         assert(id);
7525
7526         /* It is not necessary to create ExecSharedRuntime object. */
7527         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7528                 *ret = NULL;
7529                 return 0;
7530         }
7531
7532         if (c->private_tmp &&
7533             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7534               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7535                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7536                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7537                 if (r < 0)
7538                         return r;
7539         }
7540
7541         if (exec_needs_network_namespace(c)) {
7542                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7543                         return -errno;
7544         }
7545
7546         if (exec_needs_ipc_namespace(c)) {
7547                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7548                         return -errno;
7549         }
7550
7551         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7552         if (r < 0)
7553                 return r;
7554
7555         return 1;
7556 }
7557
7558 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7559         ExecSharedRuntime *rt;
7560         int r;
7561
7562         assert(m);
7563         assert(id);
7564         assert(ret);
7565
7566         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7567         if (rt)
7568                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7569                 goto ref;
7570
7571         if (!create) {
7572                 *ret = NULL;
7573                 return 0;
7574         }
7575
7576         /* If not found, then create a new object. */
7577         r = exec_shared_runtime_make(m, c, id, &rt);
7578         if (r < 0)
7579                 return r;
7580         if (r == 0) {
7581                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7582                 *ret = NULL;
7583                 return 0;
7584         }
7585
7586 ref:
7587         /* increment reference counter. */
7588         rt->n_ref++;
7589         *ret = rt;
7590         return 1;
7591 }
7592
7593 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7594         ExecSharedRuntime *rt;
7595
7596         assert(m);
7597         assert(f);
7598         assert(fds);
7599
7600         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7601                 fprintf(f, "exec-runtime=%s", rt->id);
7602
7603                 if (rt->tmp_dir)
7604                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7605
7606                 if (rt->var_tmp_dir)
7607                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7608
7609                 if (rt->netns_storage_socket[0] >= 0) {
7610                         int copy;
7611
7612                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7613                         if (copy < 0)
7614                                 return copy;
7615
7616                         fprintf(f, " netns-socket-0=%i", copy);
7617                 }
7618
7619                 if (rt->netns_storage_socket[1] >= 0) {
7620                         int copy;
7621
7622                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7623                         if (copy < 0)
7624                                 return copy;
7625
7626                         fprintf(f, " netns-socket-1=%i", copy);
7627                 }
7628
7629                 if (rt->ipcns_storage_socket[0] >= 0) {
7630                         int copy;
7631
7632                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7633                         if (copy < 0)
7634                                 return copy;
7635
7636                         fprintf(f, " ipcns-socket-0=%i", copy);
7637                 }
7638
7639                 if (rt->ipcns_storage_socket[1] >= 0) {
7640                         int copy;
7641
7642                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7643                         if (copy < 0)
7644                                 return copy;
7645
7646                         fprintf(f, " ipcns-socket-1=%i", copy);
7647                 }
7648
7649                 fputc('\n', f);
7650         }
7651
7652         return 0;
7653 }
7654
7655 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7656         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7657         ExecSharedRuntime *rt;
7658         int r;
7659
7660         /* This is for the migration from old (v237 or earlier) deserialization text.
7661          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7662          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7663          * so or not from the serialized text, then we always creates a new object owned by this. */
7664
7665         assert(u);
7666         assert(key);
7667         assert(value);
7668
7669         /* Manager manages ExecSharedRuntime objects by the unit id.
7670          * So, we omit the serialized text when the unit does not have id (yet?)... */
7671         if (isempty(u->id)) {
7672                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7673                 return 0;
7674         }
7675
7676         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7677                 return log_oom();
7678
7679         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7680         if (!rt) {
7681                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7682                         return log_oom();
7683
7684                 rt = rt_create;
7685         }
7686
7687         if (streq(key, "tmp-dir")) {
7688                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7689                         return -ENOMEM;
7690
7691         } else if (streq(key, "var-tmp-dir")) {
7692                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7693                         return -ENOMEM;
7694
7695         } else if (streq(key, "netns-socket-0")) {
7696                 int fd;
7697
7698                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7699                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7700                         return 0;
7701                 }
7702
7703                 safe_close(rt->netns_storage_socket[0]);
7704                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7705
7706         } else if (streq(key, "netns-socket-1")) {
7707                 int fd;
7708
7709                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7710                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7711                         return 0;
7712                 }
7713
7714                 safe_close(rt->netns_storage_socket[1]);
7715                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7716
7717         } else
7718                 return 0;
7719
7720         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7721         if (rt_create) {
7722                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7723                 if (r < 0) {
7724                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7725                         return 0;
7726                 }
7727
7728                 rt_create->manager = u->manager;
7729
7730                 /* Avoid cleanup */
7731                 TAKE_PTR(rt_create);
7732         }
7733
7734         return 1;
7735 }
7736
7737 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7738         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7739         char *id = NULL;
7740         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7741         const char *p, *v = ASSERT_PTR(value);
7742         size_t n;
7743
7744         assert(m);
7745         assert(fds);
7746
7747         n = strcspn(v, " ");
7748         id = strndupa_safe(v, n);
7749         if (v[n] != ' ')
7750                 goto finalize;
7751         p = v + n + 1;
7752
7753         v = startswith(p, "tmp-dir=");
7754         if (v) {
7755                 n = strcspn(v, " ");
7756                 tmp_dir = strndup(v, n);
7757                 if (!tmp_dir)
7758                         return log_oom();
7759                 if (v[n] != ' ')
7760                         goto finalize;
7761                 p = v + n + 1;
7762         }
7763
7764         v = startswith(p, "var-tmp-dir=");
7765         if (v) {
7766                 n = strcspn(v, " ");
7767                 var_tmp_dir = strndup(v, n);
7768                 if (!var_tmp_dir)
7769                         return log_oom();
7770                 if (v[n] != ' ')
7771                         goto finalize;
7772                 p = v + n + 1;
7773         }
7774
7775         v = startswith(p, "netns-socket-0=");
7776         if (v) {
7777                 char *buf;
7778
7779                 n = strcspn(v, " ");
7780                 buf = strndupa_safe(v, n);
7781
7782                 netns_fdpair[0] = parse_fd(buf);
7783                 if (netns_fdpair[0] < 0)
7784                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7785                 if (!fdset_contains(fds, netns_fdpair[0]))
7786                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7787                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7788                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7789                 if (v[n] != ' ')
7790                         goto finalize;
7791                 p = v + n + 1;
7792         }
7793
7794         v = startswith(p, "netns-socket-1=");
7795         if (v) {
7796                 char *buf;
7797
7798                 n = strcspn(v, " ");
7799                 buf = strndupa_safe(v, n);
7800
7801                 netns_fdpair[1] = parse_fd(buf);
7802                 if (netns_fdpair[1] < 0)
7803                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7804                 if (!fdset_contains(fds, netns_fdpair[1]))
7805                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7806                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7807                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7808                 if (v[n] != ' ')
7809                         goto finalize;
7810                 p = v + n + 1;
7811         }
7812
7813         v = startswith(p, "ipcns-socket-0=");
7814         if (v) {
7815                 char *buf;
7816
7817                 n = strcspn(v, " ");
7818                 buf = strndupa_safe(v, n);
7819
7820                 ipcns_fdpair[0] = parse_fd(buf);
7821                 if (ipcns_fdpair[0] < 0)
7822                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7823                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7824                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7825                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7826                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7827                 if (v[n] != ' ')
7828                         goto finalize;
7829                 p = v + n + 1;
7830         }
7831
7832         v = startswith(p, "ipcns-socket-1=");
7833         if (v) {
7834                 char *buf;
7835
7836                 n = strcspn(v, " ");
7837                 buf = strndupa_safe(v, n);
7838
7839                 ipcns_fdpair[1] = parse_fd(buf);
7840                 if (ipcns_fdpair[1] < 0)
7841                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7842                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7843                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7844                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7845                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7846         }
7847
7848 finalize:
7849         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7850         if (r < 0)
7851                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7852         return 0;
7853 }
7854
7855 void exec_shared_runtime_vacuum(Manager *m) {
7856         ExecSharedRuntime *rt;
7857
7858         assert(m);
7859
7860         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7861
7862         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7863                 if (rt->n_ref > 0)
7864                         continue;
7865
7866                 (void) exec_shared_runtime_free(rt);
7867         }
7868 }
7869
7870 int exec_runtime_make(
7871                 const Unit *unit,
7872                 const ExecContext *context,
7873                 ExecSharedRuntime *shared,
7874                 DynamicCreds *creds,
7875                 ExecRuntime **ret) {
7876         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7877         _cleanup_free_ char *ephemeral = NULL;
7878         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7879         int r;
7880
7881         assert(unit);
7882         assert(context);
7883         assert(ret);
7884
7885         if (!shared && !creds && !exec_needs_ephemeral(context)) {
7886                 *ret = NULL;
7887                 return 0;
7888         }
7889
7890         if (exec_needs_ephemeral(context)) {
7891                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7892                 if (r < 0)
7893                         return r;
7894
7895                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7896                 if (r < 0)
7897                         return r;
7898
7899                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7900                         return -errno;
7901         }
7902
7903         rt = new(ExecRuntime, 1);
7904         if (!rt)
7905                 return -ENOMEM;
7906
7907         *rt = (ExecRuntime) {
7908                 .shared = shared,
7909                 .dynamic_creds = creds,
7910                 .ephemeral_copy = TAKE_PTR(ephemeral),
7911                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7912                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7913         };
7914
7915         *ret = TAKE_PTR(rt);
7916         return 1;
7917 }
7918
7919 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7920         if (!rt)
7921                 return NULL;
7922
7923         exec_shared_runtime_unref(rt->shared);
7924         dynamic_creds_unref(rt->dynamic_creds);
7925
7926         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7927
7928         safe_close_pair(rt->ephemeral_storage_socket);
7929         return mfree(rt);
7930 }
7931
7932 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7933         if (!rt)
7934                 return NULL;
7935
7936         rt->shared = exec_shared_runtime_destroy(rt->shared);
7937         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7938         return exec_runtime_free(rt);
7939 }
7940
7941 void exec_params_clear(ExecParameters *p) {
7942         if (!p)
7943                 return;
7944
7945         p->environment = strv_free(p->environment);
7946         p->fd_names = strv_free(p->fd_names);
7947         p->fds = mfree(p->fds);
7948         p->exec_fd = safe_close(p->exec_fd);
7949 }
7950
7951 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7952         if (!sc)
7953                 return NULL;
7954
7955         free(sc->id);
7956         free(sc->data);
7957         return mfree(sc);
7958 }
7959
7960 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7961         if (!lc)
7962                 return NULL;
7963
7964         free(lc->id);
7965         free(lc->path);
7966         return mfree(lc);
7967 }
7968
7969 void exec_directory_done(ExecDirectory *d) {
7970         if (!d)
7971                 return;
7972
7973         for (size_t i = 0; i < d->n_items; i++) {
7974                 free(d->items[i].path);
7975                 strv_free(d->items[i].symlinks);
7976         }
7977
7978         d->items = mfree(d->items);
7979         d->n_items = 0;
7980         d->mode = 0755;
7981 }
7982
7983 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7984         assert(d);
7985         assert(path);
7986
7987         for (size_t i = 0; i < d->n_items; i++)
7988                 if (path_equal(d->items[i].path, path))
7989                         return &d->items[i];
7990
7991         return NULL;
7992 }
7993
7994 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7995         _cleanup_strv_free_ char **s = NULL;
7996         _cleanup_free_ char *p = NULL;
7997         ExecDirectoryItem *existing;
7998         int r;
7999
8000         assert(d);
8001         assert(path);
8002
8003         existing = exec_directory_find(d, path);
8004         if (existing) {
8005                 r = strv_extend(&existing->symlinks, symlink);
8006                 if (r < 0)
8007                         return r;
8008
8009                 return 0; /* existing item is updated */
8010         }
8011
8012         p = strdup(path);
8013         if (!p)
8014                 return -ENOMEM;
8015
8016         if (symlink) {
8017                 s = strv_new(symlink);
8018                 if (!s)
8019                         return -ENOMEM;
8020         }
8021
8022         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
8023                 return -ENOMEM;
8024
8025         d->items[d->n_items++] = (ExecDirectoryItem) {
8026                 .path = TAKE_PTR(p),
8027                 .symlinks = TAKE_PTR(s),
8028         };
8029
8030         return 1; /* new item is added */
8031 }
8032
8033 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
8034         assert(a);
8035         assert(b);
8036
8037         return path_compare(a->path, b->path);
8038 }
8039
8040 void exec_directory_sort(ExecDirectory *d) {
8041         assert(d);
8042
8043         /* Sort the exec directories to make always parent directories processed at first in
8044          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8045          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8046          * list. See also comments in setup_exec_directory() and issue #24783. */
8047
8048         if (d->n_items <= 1)
8049                 return;
8050
8051         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8052
8053         for (size_t i = 1; i < d->n_items; i++)
8054                 for (size_t j = 0; j < i; j++)
8055                         if (path_startswith(d->items[i].path, d->items[j].path)) {
8056                                 d->items[i].only_create = true;
8057                                 break;
8058                         }
8059 }
8060
8061 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8062         ExecDirectoryType t;
8063
8064         assert(s);
8065
8066         if (streq(s, "all"))
8067                 return EXEC_CLEAN_ALL;
8068         if (streq(s, "fdstore"))
8069                 return EXEC_CLEAN_FDSTORE;
8070
8071         t = exec_resource_type_from_string(s);
8072         if (t < 0)
8073                 return (ExecCleanMask) t;
8074
8075         return 1U << t;
8076 }
8077
8078 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
8079 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8080
8081 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
8082         [EXEC_INPUT_NULL] = "null",
8083         [EXEC_INPUT_TTY] = "tty",
8084         [EXEC_INPUT_TTY_FORCE] = "tty-force",
8085         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
8086         [EXEC_INPUT_SOCKET] = "socket",
8087         [EXEC_INPUT_NAMED_FD] = "fd",
8088         [EXEC_INPUT_DATA] = "data",
8089         [EXEC_INPUT_FILE] = "file",
8090 };
8091
8092 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8093
8094 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8095         [EXEC_OUTPUT_INHERIT] = "inherit",
8096         [EXEC_OUTPUT_NULL] = "null",
8097         [EXEC_OUTPUT_TTY] = "tty",
8098         [EXEC_OUTPUT_KMSG] = "kmsg",
8099         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8100         [EXEC_OUTPUT_JOURNAL] = "journal",
8101         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8102         [EXEC_OUTPUT_SOCKET] = "socket",
8103         [EXEC_OUTPUT_NAMED_FD] = "fd",
8104         [EXEC_OUTPUT_FILE] = "file",
8105         [EXEC_OUTPUT_FILE_APPEND] = "append",
8106         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8107 };
8108
8109 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8110
8111 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8112         [EXEC_UTMP_INIT] = "init",
8113         [EXEC_UTMP_LOGIN] = "login",
8114         [EXEC_UTMP_USER] = "user",
8115 };
8116
8117 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8118
8119 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8120         [EXEC_PRESERVE_NO] = "no",
8121         [EXEC_PRESERVE_YES] = "yes",
8122         [EXEC_PRESERVE_RESTART] = "restart",
8123 };
8124
8125 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8126
8127 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8128 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8129         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8130         [EXEC_DIRECTORY_STATE] = "StateDirectory",
8131         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8132         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8133         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8134 };
8135
8136 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8137
/* Maps each ExecDirectoryType to the name of the unit setting through which the symlinks for directories
 * of that type are configured. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8148
8149 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8150  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8151  * directories, specifically .timer units with their timestamp touch file. */
8152 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8153         [EXEC_DIRECTORY_RUNTIME] = "runtime",
8154         [EXEC_DIRECTORY_STATE] = "state",
8155         [EXEC_DIRECTORY_CACHE] = "cache",
8156         [EXEC_DIRECTORY_LOGS] = "logs",
8157         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8158 };
8159
8160 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8161
8162 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8163  * the service payload in. */
8164 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8165         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8166         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8167         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8168         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8169         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8170 };
8171
8172 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8173
8174 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8175         [EXEC_KEYRING_INHERIT] = "inherit",
8176         [EXEC_KEYRING_PRIVATE] = "private",
8177         [EXEC_KEYRING_SHARED] = "shared",
8178 };
8179
8180 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);