src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "argv-util.h"
  43 #include "async.h"
  44 #include "barrier.h"
  45 #include "bpf-lsm.h"
  46 #include "cap-list.h"
  47 #include "capability-util.h"
  48 #include "cgroup-setup.h"
  49 #include "chase-symlinks.h"
  50 #include "chown-recursive.h"
  51 #include "constants.h"
  52 #include "cpu-set-util.h"
  53 #include "creds-util.h"
  54 #include "data-fd-util.h"
  55 #include "env-file.h"
  56 #include "env-util.h"
  57 #include "errno-list.h"
  58 #include "escape.h"
  59 #include "execute.h"
  60 #include "exit-status.h"
  61 #include "fd-util.h"
  62 #include "fileio.h"
  63 #include "format-util.h"
  64 #include "glob-util.h"
  65 #include "hexdecoct.h"
  66 #include "io-util.h"
  67 #include "ioprio-util.h"
  68 #include "label.h"
  69 #include "log.h"
  70 #include "macro.h"
  71 #include "manager.h"
  72 #include "manager-dump.h"
  73 #include "memory-util.h"
  74 #include "missing_fs.h"
  75 #include "missing_ioprio.h"
  76 #include "mkdir-label.h"
  77 #include "mount-util.h"
  78 #include "mountpoint-util.h"
  79 #include "namespace.h"
  80 #include "parse-util.h"
  81 #include "path-util.h"
  82 #include "process-util.h"
  83 #include "random-util.h"
  84 #include "recurse-dir.h"
  85 #include "rlimit-util.h"
  86 #include "rm-rf.h"
  87 #if HAVE_SECCOMP
  88 #include "seccomp-util.h"
  89 #endif
  90 #include "securebits-util.h"
  91 #include "selinux-util.h"
  92 #include "signal-util.h"
  93 #include "smack-util.h"
  94 #include "socket-util.h"
  95 #include "sort-util.h"
  96 #include "special.h"
  97 #include "stat-util.h"
  98 #include "string-table.h"
  99 #include "string-util.h"
 100 #include "strv.h"
 101 #include "syslog-util.h"
 102 #include "terminal-util.h"
 103 #include "tmpfile-util.h"
 104 #include "umask-util.h"
 105 #include "unit-serialize.h"
 106 #include "user-util.h"
 107 #include "utmp-wtmp.h"
 108
 109 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 110 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 111
 112 #define SNDBUF_SIZE (8*1024*1024)
 113
 114 static int shift_fds(int fds[], size_t n_fds) {
 115         if (n_fds <= 0)
 116                 return 0;
 117
 118         /* Modifies the fds array! (sorts it) */
 119
 120         assert(fds);
 121
 122         for (int start = 0;;) {
 123                 int restart_from = -1;
 124
 125                 for (int i = start; i < (int) n_fds; i++) {
 126                         int nfd;
 127
 128                         /* Already at right index? */
 129                         if (fds[i] == i+3)
 130                                 continue;
 131
 132                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 133                         if (nfd < 0)
 134                                 return -errno;
 135
 136                         safe_close(fds[i]);
 137                         fds[i] = nfd;
 138
 139                         /* Hmm, the fd we wanted isn't free? Then
 140                          * let's remember that and try again from here */
 141                         if (nfd != i+3 && restart_from < 0)
 142                                 restart_from = i;
 143                 }
 144
 145                 if (restart_from < 0)
 146                         break;
 147
 148                 start = restart_from;
 149         }
 150
 151         return 0;
 152 }
 153
 154 static int flags_fds(
 155                 const int fds[],
 156                 size_t n_socket_fds,
 157                 size_t n_fds,
 158                 bool nonblock) {
 159
 160         int r;
 161
 162         if (n_fds <= 0)
 163                 return 0;
 164
 165         assert(fds);
 166
 167         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 168          * O_NONBLOCK only applies to socket activation though. */
 169
 170         for (size_t i = 0; i < n_fds; i++) {
 171
 172                 if (i < n_socket_fds) {
 173                         r = fd_nonblock(fds[i], nonblock);
 174                         if (r < 0)
 175                                 return r;
 176                 }
 177
 178                 /* We unconditionally drop FD_CLOEXEC from the fds,
 179                  * since after all we want to pass these fds to our
 180                  * children */
 181
 182                 r = fd_cloexec(fds[i], false);
 183                 if (r < 0)
 184                         return r;
 185         }
 186
 187         return 0;
 188 }
 189
 190 static const char *exec_context_tty_path(const ExecContext *context) {
 191         assert(context);
 192
 193         if (context->stdio_as_fds)
 194                 return NULL;
 195
 196         if (context->tty_path)
 197                 return context->tty_path;
 198
 199         return "/dev/console";
 200 }
 201
 202 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 203         const char *path;
 204
 205         assert(context);
 206
 207         path = exec_context_tty_path(context);
 208
 209         if (context->tty_vhangup) {
 210                 if (p && p->stdin_fd >= 0)
 211                         (void) terminal_vhangup_fd(p->stdin_fd);
 212                 else if (path)
 213                         (void) terminal_vhangup(path);
 214         }
 215
 216         if (context->tty_reset) {
 217                 if (p && p->stdin_fd >= 0)
 218                         (void) reset_terminal_fd(p->stdin_fd, true);
 219                 else if (path)
 220                         (void) reset_terminal(path);
 221         }
 222
 223         if (p && p->stdin_fd >= 0)
 224                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
 225
 226         if (context->tty_vt_disallocate && path)
 227                 (void) vt_disallocate(path);
 228 }
 229
 230 static bool is_terminal_input(ExecInput i) {
 231         return IN_SET(i,
 232                       EXEC_INPUT_TTY,
 233                       EXEC_INPUT_TTY_FORCE,
 234                       EXEC_INPUT_TTY_FAIL);
 235 }
 236
 237 static bool is_terminal_output(ExecOutput o) {
 238         return IN_SET(o,
 239                       EXEC_OUTPUT_TTY,
 240                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 241                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 242 }
 243
 244 static bool is_kmsg_output(ExecOutput o) {
 245         return IN_SET(o,
 246                       EXEC_OUTPUT_KMSG,
 247                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 248 }
 249
 250 static bool exec_context_needs_term(const ExecContext *c) {
 251         assert(c);
 252
 253         /* Return true if the execution context suggests we should set $TERM to something useful. */
 254
 255         if (is_terminal_input(c->std_input))
 256                 return true;
 257
 258         if (is_terminal_output(c->std_output))
 259                 return true;
 260
 261         if (is_terminal_output(c->std_error))
 262                 return true;
 263
 264         return !!c->tty_path;
 265 }
 266
 267 static int open_null_as(int flags, int nfd) {
 268         int fd;
 269
 270         assert(nfd >= 0);
 271
 272         fd = open("/dev/null", flags|O_NOCTTY);
 273         if (fd < 0)
 274                 return -errno;
 275
 276         return move_fd(fd, nfd, false);
 277 }
 278
 279 static int connect_journal_socket(
 280                 int fd,
 281                 const char *log_namespace,
 282                 uid_t uid,
 283                 gid_t gid) {
 284
 285         uid_t olduid = UID_INVALID;
 286         gid_t oldgid = GID_INVALID;
 287         const char *j;
 288         int r;
 289
 290         j = log_namespace ?
 291                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 292                 "/run/systemd/journal/stdout";
 293
 294         if (gid_is_valid(gid)) {
 295                 oldgid = getgid();
 296
 297                 if (setegid(gid) < 0)
 298                         return -errno;
 299         }
 300
 301         if (uid_is_valid(uid)) {
 302                 olduid = getuid();
 303
 304                 if (seteuid(uid) < 0) {
 305                         r = -errno;
 306                         goto restore_gid;
 307                 }
 308         }
 309
 310         r = connect_unix_path(fd, AT_FDCWD, j);
 311
 312         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 313            an LSM interferes. */
 314
 315         if (uid_is_valid(uid))
 316                 (void) seteuid(olduid);
 317
 318  restore_gid:
 319         if (gid_is_valid(gid))
 320                 (void) setegid(oldgid);
 321
 322         return r;
 323 }
 324
 325 static int connect_logger_as(
 326                 const Unit *unit,
 327                 const ExecContext *context,
 328                 const ExecParameters *params,
 329                 ExecOutput output,
 330                 const char *ident,
 331                 int nfd,
 332                 uid_t uid,
 333                 gid_t gid) {
 334
 335         _cleanup_close_ int fd = -EBADF;
 336         int r;
 337
 338         assert(context);
 339         assert(params);
 340         assert(output < _EXEC_OUTPUT_MAX);
 341         assert(ident);
 342         assert(nfd >= 0);
 343
 344         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 345         if (fd < 0)
 346                 return -errno;
 347
 348         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 349         if (r < 0)
 350                 return r;
 351
 352         if (shutdown(fd, SHUT_RD) < 0)
 353                 return -errno;
 354
 355         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 356
 357         if (dprintf(fd,
 358                 "%s\n"
 359                 "%s\n"
 360                 "%i\n"
 361                 "%i\n"
 362                 "%i\n"
 363                 "%i\n"
 364                 "%i\n",
 365                 context->syslog_identifier ?: ident,
 366                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 367                 context->syslog_priority,
 368                 !!context->syslog_level_prefix,
 369                 false,
 370                 is_kmsg_output(output),
 371                 is_terminal_output(output)) < 0)
 372                 return -errno;
 373
 374         return move_fd(TAKE_FD(fd), nfd, false);
 375 }
 376
 377 static int open_terminal_as(const char *path, int flags, int nfd) {
 378         int fd;
 379
 380         assert(path);
 381         assert(nfd >= 0);
 382
 383         fd = open_terminal(path, flags | O_NOCTTY);
 384         if (fd < 0)
 385                 return fd;
 386
 387         return move_fd(fd, nfd, false);
 388 }
 389
 390 static int acquire_path(const char *path, int flags, mode_t mode) {
 391         _cleanup_close_ int fd = -EBADF;
 392         int r;
 393
 394         assert(path);
 395
 396         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 397                 flags |= O_CREAT;
 398
 399         fd = open(path, flags|O_NOCTTY, mode);
 400         if (fd >= 0)
 401                 return TAKE_FD(fd);
 402
 403         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 404                 return -errno;
 405
 406         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 407
 408         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 409         if (fd < 0)
 410                 return -errno;
 411
 412         r = connect_unix_path(fd, AT_FDCWD, path);
 413         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 414                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 415                  * wasn't an AF_UNIX socket after all */
 416                 return -ENXIO;
 417         if (r < 0)
 418                 return r;
 419
 420         if ((flags & O_ACCMODE) == O_RDONLY)
 421                 r = shutdown(fd, SHUT_WR);
 422         else if ((flags & O_ACCMODE) == O_WRONLY)
 423                 r = shutdown(fd, SHUT_RD);
 424         else
 425                 r = 0;
 426         if (r < 0)
 427                 return -errno;
 428
 429         return TAKE_FD(fd);
 430 }
 431
 432 static int fixup_input(
 433                 const ExecContext *context,
 434                 int socket_fd,
 435                 bool apply_tty_stdin) {
 436
 437         ExecInput std_input;
 438
 439         assert(context);
 440
 441         std_input = context->std_input;
 442
 443         if (is_terminal_input(std_input) && !apply_tty_stdin)
 444                 return EXEC_INPUT_NULL;
 445
 446         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 447                 return EXEC_INPUT_NULL;
 448
 449         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 450                 return EXEC_INPUT_NULL;
 451
 452         return std_input;
 453 }
 454
 455 static int fixup_output(ExecOutput output, int socket_fd) {
 456
 457         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 458                 return EXEC_OUTPUT_INHERIT;
 459
 460         return output;
 461 }
 462
 463 static int setup_input(
 464                 const ExecContext *context,
 465                 const ExecParameters *params,
 466                 int socket_fd,
 467                 const int named_iofds[static 3]) {
 468
 469         ExecInput i;
 470         int r;
 471
 472         assert(context);
 473         assert(params);
 474         assert(named_iofds);
 475
 476         if (params->stdin_fd >= 0) {
 477                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 478                         return -errno;
 479
 480                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 481                 if (isatty(STDIN_FILENO)) {
 482                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 483                         (void) reset_terminal_fd(STDIN_FILENO, true);
 484                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
 485                 }
 486
 487                 return STDIN_FILENO;
 488         }
 489
 490         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 491
 492         switch (i) {
 493
 494         case EXEC_INPUT_NULL:
 495                 return open_null_as(O_RDONLY, STDIN_FILENO);
 496
 497         case EXEC_INPUT_TTY:
 498         case EXEC_INPUT_TTY_FORCE:
 499         case EXEC_INPUT_TTY_FAIL: {
 500                 int fd;
 501
 502                 fd = acquire_terminal(exec_context_tty_path(context),
 503                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 504                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 505                                                                   ACQUIRE_TERMINAL_WAIT,
 506                                       USEC_INFINITY);
 507                 if (fd < 0)
 508                         return fd;
 509
 510                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
 511                 if (r < 0)
 512                         return r;
 513
 514                 return move_fd(fd, STDIN_FILENO, false);
 515         }
 516
 517         case EXEC_INPUT_SOCKET:
 518                 assert(socket_fd >= 0);
 519
 520                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 521
 522         case EXEC_INPUT_NAMED_FD:
 523                 assert(named_iofds[STDIN_FILENO] >= 0);
 524
 525                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 526                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 527
 528         case EXEC_INPUT_DATA: {
 529                 int fd;
 530
 531                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 532                 if (fd < 0)
 533                         return fd;
 534
 535                 return move_fd(fd, STDIN_FILENO, false);
 536         }
 537
 538         case EXEC_INPUT_FILE: {
 539                 bool rw;
 540                 int fd;
 541
 542                 assert(context->stdio_file[STDIN_FILENO]);
 543
 544                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 545                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 546
 547                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 548                 if (fd < 0)
 549                         return fd;
 550
 551                 return move_fd(fd, STDIN_FILENO, false);
 552         }
 553
 554         default:
 555                 assert_not_reached();
 556         }
 557 }
 558
 559 static bool can_inherit_stderr_from_stdout(
 560                 const ExecContext *context,
 561                 ExecOutput o,
 562                 ExecOutput e) {
 563
 564         assert(context);
 565
 566         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 567          * stderr fd */
 568
 569         if (e == EXEC_OUTPUT_INHERIT)
 570                 return true;
 571         if (e != o)
 572                 return false;
 573
 574         if (e == EXEC_OUTPUT_NAMED_FD)
 575                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 576
 577         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 578                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 579
 580         return true;
 581 }
 582
 583 static int setup_output(
 584                 const Unit *unit,
 585                 const ExecContext *context,
 586                 const ExecParameters *params,
 587                 int fileno,
 588                 int socket_fd,
 589                 const int named_iofds[static 3],
 590                 const char *ident,
 591                 uid_t uid,
 592                 gid_t gid,
 593                 dev_t *journal_stream_dev,
 594                 ino_t *journal_stream_ino) {
 595
 596         ExecOutput o;
 597         ExecInput i;
 598         int r;
 599
 600         assert(unit);
 601         assert(context);
 602         assert(params);
 603         assert(ident);
 604         assert(journal_stream_dev);
 605         assert(journal_stream_ino);
 606
 607         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 608
 609                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 610                         return -errno;
 611
 612                 return STDOUT_FILENO;
 613         }
 614
 615         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 616                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 617                         return -errno;
 618
 619                 return STDERR_FILENO;
 620         }
 621
 622         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 623         o = fixup_output(context->std_output, socket_fd);
 624
 625         if (fileno == STDERR_FILENO) {
 626                 ExecOutput e;
 627                 e = fixup_output(context->std_error, socket_fd);
 628
 629                 /* This expects the input and output are already set up */
 630
 631                 /* Don't change the stderr file descriptor if we inherit all
 632                  * the way and are not on a tty */
 633                 if (e == EXEC_OUTPUT_INHERIT &&
 634                     o == EXEC_OUTPUT_INHERIT &&
 635                     i == EXEC_INPUT_NULL &&
 636                     !is_terminal_input(context->std_input) &&
 637                     getppid() != 1)
 638                         return fileno;
 639
 640                 /* Duplicate from stdout if possible */
 641                 if (can_inherit_stderr_from_stdout(context, o, e))
 642                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 643
 644                 o = e;
 645
 646         } else if (o == EXEC_OUTPUT_INHERIT) {
 647                 /* If input got downgraded, inherit the original value */
 648                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 649                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 650
 651                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 652                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 653                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 654
 655                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 656                 if (getppid() != 1)
 657                         return fileno;
 658
 659                 /* We need to open /dev/null here anew, to get the right access mode. */
 660                 return open_null_as(O_WRONLY, fileno);
 661         }
 662
 663         switch (o) {
 664
 665         case EXEC_OUTPUT_NULL:
 666                 return open_null_as(O_WRONLY, fileno);
 667
 668         case EXEC_OUTPUT_TTY:
 669                 if (is_terminal_input(i))
 670                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 671
 672                 /* We don't reset the terminal if this is just about output */
 673                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 674
 675         case EXEC_OUTPUT_KMSG:
 676         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 677         case EXEC_OUTPUT_JOURNAL:
 678         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 679                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 680                 if (r < 0) {
 681                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 682                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 683                         r = open_null_as(O_WRONLY, fileno);
 684                 } else {
 685                         struct stat st;
 686
 687                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 688                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 689                          * services to detect whether they are connected to the journal or not.
 690                          *
 691                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 692                          * about STDERR as that's usually the best way to do logging. */
 693
 694                         if (fstat(fileno, &st) >= 0 &&
 695                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 696                                 *journal_stream_dev = st.st_dev;
 697                                 *journal_stream_ino = st.st_ino;
 698                         }
 699                 }
 700                 return r;
 701
 702         case EXEC_OUTPUT_SOCKET:
 703                 assert(socket_fd >= 0);
 704
 705                 return RET_NERRNO(dup2(socket_fd, fileno));
 706
 707         case EXEC_OUTPUT_NAMED_FD:
 708                 assert(named_iofds[fileno] >= 0);
 709
 710                 (void) fd_nonblock(named_iofds[fileno], false);
 711                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 712
 713         case EXEC_OUTPUT_FILE:
 714         case EXEC_OUTPUT_FILE_APPEND:
 715         case EXEC_OUTPUT_FILE_TRUNCATE: {
 716                 bool rw;
 717                 int fd, flags;
 718
 719                 assert(context->stdio_file[fileno]);
 720
 721                 rw = context->std_input == EXEC_INPUT_FILE &&
 722                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 723
 724                 if (rw)
 725                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 726
 727                 flags = O_WRONLY;
 728                 if (o == EXEC_OUTPUT_FILE_APPEND)
 729                         flags |= O_APPEND;
 730                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 731                         flags |= O_TRUNC;
 732
 733                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 734                 if (fd < 0)
 735                         return fd;
 736
 737                 return move_fd(fd, fileno, 0);
 738         }
 739
 740         default:
 741                 assert_not_reached();
 742         }
 743 }
 744
 745 static int chown_terminal(int fd, uid_t uid) {
 746         int r;
 747
 748         assert(fd >= 0);
 749
 750         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 751         if (isatty(fd) < 1) {
 752                 if (IN_SET(errno, EINVAL, ENOTTY))
 753                         return 0; /* not a tty */
 754
 755                 return -errno;
 756         }
 757
 758         /* This might fail. What matters are the results. */
 759         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 760         if (r < 0)
 761                 return r;
 762
 763         return 1;
 764 }
 765
 766 static int setup_confirm_stdio(
 767                 const ExecContext *context,
 768                 const char *vc,
 769                 int *ret_saved_stdin,
 770                 int *ret_saved_stdout) {
 771
 772         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 773         int r;
 774
 775         assert(ret_saved_stdin);
 776         assert(ret_saved_stdout);
 777
 778         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 779         if (saved_stdin < 0)
 780                 return -errno;
 781
 782         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 783         if (saved_stdout < 0)
 784                 return -errno;
 785
 786         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 787         if (fd < 0)
 788                 return fd;
 789
 790         r = chown_terminal(fd, getuid());
 791         if (r < 0)
 792                 return r;
 793
 794         r = reset_terminal_fd(fd, true);
 795         if (r < 0)
 796                 return r;
 797
 798         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
 799         if (r < 0)
 800                 return r;
 801
 802         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 803         TAKE_FD(fd);
 804         if (r < 0)
 805                 return r;
 806
 807         *ret_saved_stdin = TAKE_FD(saved_stdin);
 808         *ret_saved_stdout = TAKE_FD(saved_stdout);
 809         return 0;
 810 }
 811
 812 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 813         assert(err < 0);
 814
 815         if (err == -ETIMEDOUT)
 816                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 817         else {
 818                 errno = -err;
 819                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 820         }
 821 }
 822
 823 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 824         _cleanup_close_ int fd = -EBADF;
 825
 826         assert(vc);
 827
 828         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 829         if (fd < 0)
 830                 return;
 831
 832         write_confirm_error_fd(err, fd, u);
 833 }
 834
 835 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 836         int r = 0;
 837
 838         assert(saved_stdin);
 839         assert(saved_stdout);
 840
 841         release_terminal();
 842
 843         if (*saved_stdin >= 0)
 844                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 845                         r = -errno;
 846
 847         if (*saved_stdout >= 0)
 848                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 849                         r = -errno;
 850
 851         *saved_stdin = safe_close(*saved_stdin);
 852         *saved_stdout = safe_close(*saved_stdout);
 853
 854         return r;
 855 }
 856
 857 enum {
 858         CONFIRM_PRETEND_FAILURE = -1,
 859         CONFIRM_PRETEND_SUCCESS =  0,
 860         CONFIRM_EXECUTE = 1,
 861 };
 862
 863 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 864         int saved_stdout = -1, saved_stdin = -1, r;
 865         _cleanup_free_ char *e = NULL;
 866         char c;
 867
 868         /* For any internal errors, assume a positive response. */
 869         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 870         if (r < 0) {
 871                 write_confirm_error(r, vc, u);
 872                 return CONFIRM_EXECUTE;
 873         }
 874
 875         /* confirm_spawn might have been disabled while we were sleeping. */
 876         if (manager_is_confirm_spawn_disabled(u->manager)) {
 877                 r = 1;
 878                 goto restore_stdio;
 879         }
 880
 881         e = ellipsize(cmdline, 60, 100);
 882         if (!e) {
 883                 log_oom();
 884                 r = CONFIRM_EXECUTE;
 885                 goto restore_stdio;
 886         }
 887
 888         for (;;) {
 889                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 890                 if (r < 0) {
 891                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 892                         r = CONFIRM_EXECUTE;
 893                         goto restore_stdio;
 894                 }
 895
 896                 switch (c) {
 897                 case 'c':
 898                         printf("Resuming normal execution.\n");
 899                         manager_disable_confirm_spawn();
 900                         r = 1;
 901                         break;
 902                 case 'D':
 903                         unit_dump(u, stdout, "  ");
 904                         continue; /* ask again */
 905                 case 'f':
 906                         printf("Failing execution.\n");
 907                         r = CONFIRM_PRETEND_FAILURE;
 908                         break;
 909                 case 'h':
 910                         printf("  c - continue, proceed without asking anymore\n"
 911                                "  D - dump, show the state of the unit\n"
 912                                "  f - fail, don't execute the command and pretend it failed\n"
 913                                "  h - help\n"
 914                                "  i - info, show a short summary of the unit\n"
 915                                "  j - jobs, show jobs that are in progress\n"
 916                                "  s - skip, don't execute the command and pretend it succeeded\n"
 917                                "  y - yes, execute the command\n");
 918                         continue; /* ask again */
 919                 case 'i':
 920                         printf("  Description: %s\n"
 921                                "  Unit:        %s\n"
 922                                "  Command:     %s\n",
 923                                u->id, u->description, cmdline);
 924                         continue; /* ask again */
 925                 case 'j':
 926                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 927                         continue; /* ask again */
 928                 case 'n':
 929                         /* 'n' was removed in favor of 'f'. */
 930                         printf("Didn't understand 'n', did you mean 'f'?\n");
 931                         continue; /* ask again */
 932                 case 's':
 933                         printf("Skipping execution.\n");
 934                         r = CONFIRM_PRETEND_SUCCESS;
 935                         break;
 936                 case 'y':
 937                         r = CONFIRM_EXECUTE;
 938                         break;
 939                 default:
 940                         assert_not_reached();
 941                 }
 942                 break;
 943         }
 944
 945 restore_stdio:
 946         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 947         return r;
 948 }
 949
 950 static int get_fixed_user(const ExecContext *c, const char **user,
 951                           uid_t *uid, gid_t *gid,
 952                           const char **home, const char **shell) {
 953         int r;
 954         const char *name;
 955
 956         assert(c);
 957
 958         if (!c->user)
 959                 return 0;
 960
 961         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 962          * (i.e. are "/" or "/bin/nologin"). */
 963
 964         name = c->user;
 965         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 966         if (r < 0)
 967                 return r;
 968
 969         *user = name;
 970         return 0;
 971 }
 972
 973 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 974         int r;
 975         const char *name;
 976
 977         assert(c);
 978
 979         if (!c->group)
 980                 return 0;
 981
 982         name = c->group;
 983         r = get_group_creds(&name, gid, 0);
 984         if (r < 0)
 985                 return r;
 986
 987         *group = name;
 988         return 0;
 989 }
 990
 991 static int get_supplementary_groups(const ExecContext *c, const char *user,
 992                                     const char *group, gid_t gid,
 993                                     gid_t **supplementary_gids, int *ngids) {
 994         int r, k = 0;
 995         int ngroups_max;
 996         bool keep_groups = false;
 997         gid_t *groups = NULL;
 998         _cleanup_free_ gid_t *l_gids = NULL;
 999
1000         assert(c);
1001
1002         /*
1003          * If user is given, then lookup GID and supplementary groups list.
1004          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1005          * here and as early as possible so we keep the list of supplementary
1006          * groups of the caller.
1007          */
1008         if (user && gid_is_valid(gid) && gid != 0) {
1009                 /* First step, initialize groups from /etc/groups */
1010                 if (initgroups(user, gid) < 0)
1011                         return -errno;
1012
1013                 keep_groups = true;
1014         }
1015
1016         if (strv_isempty(c->supplementary_groups))
1017                 return 0;
1018
1019         /*
1020          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1021          * be positive, otherwise fail.
1022          */
1023         errno = 0;
1024         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1025         if (ngroups_max <= 0)
1026                 return errno_or_else(EOPNOTSUPP);
1027
1028         l_gids = new(gid_t, ngroups_max);
1029         if (!l_gids)
1030                 return -ENOMEM;
1031
1032         if (keep_groups) {
1033                 /*
1034                  * Lookup the list of groups that the user belongs to, we
1035                  * avoid NSS lookups here too for gid=0.
1036                  */
1037                 k = ngroups_max;
1038                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1039                         return -EINVAL;
1040         } else
1041                 k = 0;
1042
1043         STRV_FOREACH(i, c->supplementary_groups) {
1044                 const char *g;
1045
1046                 if (k >= ngroups_max)
1047                         return -E2BIG;
1048
1049                 g = *i;
1050                 r = get_group_creds(&g, l_gids+k, 0);
1051                 if (r < 0)
1052                         return r;
1053
1054                 k++;
1055         }
1056
1057         /*
1058          * Sets ngids to zero to drop all supplementary groups, happens
1059          * when we are under root and SupplementaryGroups= is empty.
1060          */
1061         if (k == 0) {
1062                 *ngids = 0;
1063                 return 0;
1064         }
1065
1066         /* Otherwise get the final list of supplementary groups */
1067         groups = memdup(l_gids, sizeof(gid_t) * k);
1068         if (!groups)
1069                 return -ENOMEM;
1070
1071         *supplementary_gids = groups;
1072         *ngids = k;
1073
1074         groups = NULL;
1075
1076         return 0;
1077 }
1078
/* Applies the group identity: first the supplementary group list (only if ngids is positive), then
 * the real/effective/saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1097
/* Updates the securebits of the calling process: every bit in "mask" is cleared, then every bit in
 * "bits" is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if a new value was
 * installed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1111
1112 static int enforce_user(const ExecContext *context, uid_t uid) {
1113         assert(context);
1114         int r;
1115
1116         if (!uid_is_valid(uid))
1117                 return 0;
1118
1119         /* Sets (but doesn't look up) the uid and make sure we keep the
1120          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1121          * required, so we also need keep-caps in this case.
1122          */
1123
1124         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1125
1126                 /* First step: If we need to keep capabilities but
1127                  * drop privileges we need to make sure we keep our
1128                  * caps, while we drop privileges. */
1129                 if (uid != 0) {
1130                         /* Add KEEP_CAPS to the securebits */
1131                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1132                         if (r < 0)
1133                                 return r;
1134                 }
1135         }
1136
1137         /* Second step: actually set the uids */
1138         if (setresuid(uid, uid, uid) < 0)
1139                 return -errno;
1140
1141         /* At this point we should have all necessary capabilities but
1142            are otherwise a normal user. However, the caps might got
1143            corrupted due to the setresuid() so we need clean them up
1144            later. This is done outside of this call. */
1145
1146         return 0;
1147 }
1148
1149 #if HAVE_PAM
1150
1151 static int null_conv(
1152                 int num_msg,
1153                 const struct pam_message **msg,
1154                 struct pam_response **resp,
1155                 void *appdata_ptr) {
1156
1157         /* We don't support conversations */
1158
1159         return PAM_CONV_ERR;
1160 }
1161
1162 #endif
1163
/* Opens a PAM session for service "name" and user "user", and forks off a "(sd-pam)" helper process
 * that tears the PAM state down again once it receives SIGTERM.
 *
 * name, user  PAM service name and user name handed to pam_start()
 * uid, gid    identity the helper process switches to (failures there are only logged)
 * tty         TTY to announce to PAM; if NULL, the TTY behind stdin is used if there is one
 * env         environment exported to PAM via pam_putenv(); replaced by the environment list PAM
 *             returns on success
 * fds, n_fds  file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On failure a negative errno-style value
 * is returned; PAM errors are reported as -EPERM. When built without PAM this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we have so far to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* A failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child was created, hence don't leave SIGTERM blocked in the calling process */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through a positive error number as its
                                 * return value and does not set errno, hence check the return value
                                 * itself rather than comparing against zero and reading errno. */
                                int k = sigwait(&ss, &sig);
                                if (k == 0)
                                        break;
                                if (k != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set), or a non-PAM step failed (r is negative) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1380
1381 static void rename_process_from_path(const char *path) {
1382         _cleanup_free_ char *buf = NULL;
1383         const char *p;
1384
1385         assert(path);
1386
1387         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1388          * /bin/ps */
1389
1390         if (path_extract_filename(path, &buf) < 0) {
1391                 rename_process("(...)");
1392                 return;
1393         }
1394
1395         size_t l = strlen(buf);
1396         if (l > 8) {
1397                 /* The end of the process name is usually more interesting, since the first bit might just be
1398                  * "systemd-" */
1399                 p = buf + l - 8;
1400                 l = 8;
1401         } else
1402                 p = buf;
1403
1404         char process_name[11];
1405         process_name[0] = '(';
1406         memcpy(process_name+1, p, l);
1407         process_name[1+l] = ')';
1408         process_name[1+l+1] = 0;
1409
1410         rename_process(process_name);
1411 }
1412
1413 static bool context_has_address_families(const ExecContext *c) {
1414         assert(c);
1415
1416         return c->address_families_allow_list ||
1417                 !set_isempty(c->address_families);
1418 }
1419
1420 static bool context_has_syscall_filters(const ExecContext *c) {
1421         assert(c);
1422
1423         return c->syscall_allow_list ||
1424                 !hashmap_isempty(c->syscall_filter);
1425 }
1426
1427 static bool context_has_syscall_logs(const ExecContext *c) {
1428         assert(c);
1429
1430         return c->syscall_log_allow_list ||
1431                 !hashmap_isempty(c->syscall_log);
1432 }
1433
1434 static bool context_has_no_new_privileges(const ExecContext *c) {
1435         assert(c);
1436
1437         if (c->no_new_privileges)
1438                 return true;
1439
1440         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1441                 return false;
1442
1443         /* We need NNP if we have any form of seccomp and are unprivileged */
1444         return c->lock_personality ||
1445                 c->memory_deny_write_execute ||
1446                 c->private_devices ||
1447                 c->protect_clock ||
1448                 c->protect_hostname ||
1449                 c->protect_kernel_tunables ||
1450                 c->protect_kernel_modules ||
1451                 c->protect_kernel_logs ||
1452                 context_has_address_families(c) ||
1453                 exec_context_restrict_namespaces_set(c) ||
1454                 c->restrict_realtime ||
1455                 c->restrict_suid_sgid ||
1456                 !set_isempty(c->syscall_archs) ||
1457                 context_has_syscall_filters(c) ||
1458                 context_has_syscall_logs(c);
1459 }
1460
1461 static bool exec_context_has_credentials(const ExecContext *context) {
1462
1463         assert(context);
1464
1465         return !hashmap_isempty(context->set_credentials) ||
1466                 !hashmap_isempty(context->load_credentials);
1467 }
1468
1469 #if HAVE_SECCOMP
1470
1471 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1472
1473         if (is_seccomp_available())
1474                 return false;
1475
1476         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1477         return true;
1478 }
1479
1480 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1481         uint32_t negative_action, default_action, action;
1482         int r;
1483
1484         assert(u);
1485         assert(c);
1486
1487         if (!context_has_syscall_filters(c))
1488                 return 0;
1489
1490         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1491                 return 0;
1492
1493         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1494
1495         if (c->syscall_allow_list) {
1496                 default_action = negative_action;
1497                 action = SCMP_ACT_ALLOW;
1498         } else {
1499                 default_action = SCMP_ACT_ALLOW;
1500                 action = negative_action;
1501         }
1502
1503         if (needs_ambient_hack) {
1504                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1505                 if (r < 0)
1506                         return r;
1507         }
1508
1509         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1510 }
1511
1512 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1513 #ifdef SCMP_ACT_LOG
1514         uint32_t default_action, action;
1515 #endif
1516
1517         assert(u);
1518         assert(c);
1519
1520         if (!context_has_syscall_logs(c))
1521                 return 0;
1522
1523 #ifdef SCMP_ACT_LOG
1524         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1525                 return 0;
1526
1527         if (c->syscall_log_allow_list) {
1528                 /* Log nothing but the ones listed */
1529                 default_action = SCMP_ACT_ALLOW;
1530                 action = SCMP_ACT_LOG;
1531         } else {
1532                 /* Log everything but the ones listed */
1533                 default_action = SCMP_ACT_LOG;
1534                 action = SCMP_ACT_ALLOW;
1535         }
1536
1537         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1538 #else
1539         /* old libseccomp */
1540         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1541         return 0;
1542 #endif
1543 }
1544
1545 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1546         assert(u);
1547         assert(c);
1548
1549         if (set_isempty(c->syscall_archs))
1550                 return 0;
1551
1552         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1553                 return 0;
1554
1555         return seccomp_restrict_archs(c->syscall_archs);
1556 }
1557
1558 static int apply_address_families(const Unit* u, const ExecContext *c) {
1559         assert(u);
1560         assert(c);
1561
1562         if (!context_has_address_families(c))
1563                 return 0;
1564
1565         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1566                 return 0;
1567
1568         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1569 }
1570
1571 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1572         assert(u);
1573         assert(c);
1574
1575         if (!c->memory_deny_write_execute)
1576                 return 0;
1577
1578         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1579                 return 0;
1580
1581         return seccomp_memory_deny_write_execute();
1582 }
1583
1584 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1585         assert(u);
1586         assert(c);
1587
1588         if (!c->restrict_realtime)
1589                 return 0;
1590
1591         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1592                 return 0;
1593
1594         return seccomp_restrict_realtime();
1595 }
1596
1597 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1598         assert(u);
1599         assert(c);
1600
1601         if (!c->restrict_suid_sgid)
1602                 return 0;
1603
1604         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1605                 return 0;
1606
1607         return seccomp_restrict_suid_sgid();
1608 }
1609
1610 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1611         assert(u);
1612         assert(c);
1613
1614         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1615          * let's protect even those systems where this is left on in the kernel. */
1616
1617         if (!c->protect_kernel_tunables)
1618                 return 0;
1619
1620         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1621                 return 0;
1622
1623         return seccomp_protect_sysctl();
1624 }
1625
1626 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1627         assert(u);
1628         assert(c);
1629
1630         /* Turn off module syscalls on ProtectKernelModules=yes */
1631
1632         if (!c->protect_kernel_modules)
1633                 return 0;
1634
1635         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1636                 return 0;
1637
1638         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1639 }
1640
1641 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1642         assert(u);
1643         assert(c);
1644
1645         if (!c->protect_kernel_logs)
1646                 return 0;
1647
1648         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1649                 return 0;
1650
1651         return seccomp_protect_syslog();
1652 }
1653
1654 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1655         assert(u);
1656         assert(c);
1657
1658         if (!c->protect_clock)
1659                 return 0;
1660
1661         if (skip_seccomp_unavailable(u, "ProtectClock="))
1662                 return 0;
1663
1664         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1665 }
1666
1667 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1668         assert(u);
1669         assert(c);
1670
1671         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1672
1673         if (!c->private_devices)
1674                 return 0;
1675
1676         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1677                 return 0;
1678
1679         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1680 }
1681
1682 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1683         assert(u);
1684         assert(c);
1685
1686         if (!exec_context_restrict_namespaces_set(c))
1687                 return 0;
1688
1689         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1690                 return 0;
1691
1692         return seccomp_restrict_namespaces(c->restrict_namespaces);
1693 }
1694
1695 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1696         unsigned long personality;
1697         int r;
1698
1699         assert(u);
1700         assert(c);
1701
1702         if (!c->lock_personality)
1703                 return 0;
1704
1705         if (skip_seccomp_unavailable(u, "LockPersonality="))
1706                 return 0;
1707
1708         personality = c->personality;
1709
1710         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1711         if (personality == PERSONALITY_INVALID) {
1712
1713                 r = opinionated_personality(&personality);
1714                 if (r < 0)
1715                         return r;
1716         }
1717
1718         return seccomp_lock_personality(personality);
1719 }
1720
1721 #endif
1722
1723 #if HAVE_LIBBPF
1724 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1725         assert(u);
1726         assert(c);
1727
1728         if (!exec_context_restrict_filesystems_set(c))
1729                 return 0;
1730
1731         if (!u->manager->restrict_fs) {
1732                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1733                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1734                 return 0;
1735         }
1736
1737         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1738 }
1739 #endif
1740
1741 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1742         assert(u);
1743         assert(c);
1744
1745         if (!c->protect_hostname)
1746                 return 0;
1747
1748         if (ns_type_supported(NAMESPACE_UTS)) {
1749                 if (unshare(CLONE_NEWUTS) < 0) {
1750                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1751                                 *ret_exit_status = EXIT_NAMESPACE;
1752                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1753                         }
1754
1755                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1756                 }
1757         } else
1758                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1759
1760 #if HAVE_SECCOMP
1761         int r;
1762
1763         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1764                 return 0;
1765
1766         r = seccomp_protect_hostname();
1767         if (r < 0) {
1768                 *ret_exit_status = EXIT_SECCOMP;
1769                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1770         }
1771 #endif
1772
1773         return 0;
1774 }
1775
1776 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1777         assert(idle_pipe);
1778
1779         idle_pipe[1] = safe_close(idle_pipe[1]);
1780         idle_pipe[2] = safe_close(idle_pipe[2]);
1781
1782         if (idle_pipe[0] >= 0) {
1783                 int r;
1784
1785                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1786
1787                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1788                         ssize_t n;
1789
1790                         /* Signal systemd that we are bored and want to continue. */
1791                         n = write(idle_pipe[3], "x", 1);
1792                         if (n > 0)
1793                                 /* Wait for systemd to react to the signal above. */
1794                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1795                 }
1796
1797                 idle_pipe[0] = safe_close(idle_pipe[0]);
1798
1799         }
1800
1801         idle_pipe[3] = safe_close(idle_pipe[3]);
1802 }
1803
1804 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1805
1806 static int build_environment(
1807                 const Unit *u,
1808                 const ExecContext *c,
1809                 const ExecParameters *p,
1810                 size_t n_fds,
1811                 char **fdnames,
1812                 const char *home,
1813                 const char *username,
1814                 const char *shell,
1815                 dev_t journal_stream_dev,
1816                 ino_t journal_stream_ino,
1817                 char ***ret) {
1818
1819         _cleanup_strv_free_ char **our_env = NULL;
1820         size_t n_env = 0;
1821         char *x;
1822
1823         assert(u);
1824         assert(c);
1825         assert(p);
1826         assert(ret);
1827
1828 #define N_ENV_VARS 17
1829         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1830         if (!our_env)
1831                 return -ENOMEM;
1832
1833         if (n_fds > 0) {
1834                 _cleanup_free_ char *joined = NULL;
1835
1836                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1837                         return -ENOMEM;
1838                 our_env[n_env++] = x;
1839
1840                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1841                         return -ENOMEM;
1842                 our_env[n_env++] = x;
1843
1844                 joined = strv_join(fdnames, ":");
1845                 if (!joined)
1846                         return -ENOMEM;
1847
1848                 x = strjoin("LISTEN_FDNAMES=", joined);
1849                 if (!x)
1850                         return -ENOMEM;
1851                 our_env[n_env++] = x;
1852         }
1853
1854         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1855                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1856                         return -ENOMEM;
1857                 our_env[n_env++] = x;
1858
1859                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1860                         return -ENOMEM;
1861                 our_env[n_env++] = x;
1862         }
1863
1864         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1865          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1866          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1867         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1868                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1869                 if (!x)
1870                         return -ENOMEM;
1871                 our_env[n_env++] = x;
1872         }
1873
1874         if (home) {
1875                 x = strjoin("HOME=", home);
1876                 if (!x)
1877                         return -ENOMEM;
1878
1879                 path_simplify(x + 5);
1880                 our_env[n_env++] = x;
1881         }
1882
1883         if (username) {
1884                 x = strjoin("LOGNAME=", username);
1885                 if (!x)
1886                         return -ENOMEM;
1887                 our_env[n_env++] = x;
1888
1889                 x = strjoin("USER=", username);
1890                 if (!x)
1891                         return -ENOMEM;
1892                 our_env[n_env++] = x;
1893         }
1894
1895         if (shell) {
1896                 x = strjoin("SHELL=", shell);
1897                 if (!x)
1898                         return -ENOMEM;
1899
1900                 path_simplify(x + 6);
1901                 our_env[n_env++] = x;
1902         }
1903
1904         if (!sd_id128_is_null(u->invocation_id)) {
1905                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1906                         return -ENOMEM;
1907
1908                 our_env[n_env++] = x;
1909         }
1910
1911         if (exec_context_needs_term(c)) {
1912                 const char *tty_path, *term = NULL;
1913
1914                 tty_path = exec_context_tty_path(c);
1915
1916                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1917                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1918                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1919
1920                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1921                         term = getenv("TERM");
1922
1923                 if (!term)
1924                         term = default_term_for_tty(tty_path);
1925
1926                 x = strjoin("TERM=", term);
1927                 if (!x)
1928                         return -ENOMEM;
1929                 our_env[n_env++] = x;
1930         }
1931
1932         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1933                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1934                         return -ENOMEM;
1935
1936                 our_env[n_env++] = x;
1937         }
1938
1939         if (c->log_namespace) {
1940                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1941                 if (!x)
1942                         return -ENOMEM;
1943
1944                 our_env[n_env++] = x;
1945         }
1946
1947         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1948                 _cleanup_free_ char *joined = NULL;
1949                 const char *n;
1950
1951                 if (!p->prefix[t])
1952                         continue;
1953
1954                 if (c->directories[t].n_items == 0)
1955                         continue;
1956
1957                 n = exec_directory_env_name_to_string(t);
1958                 if (!n)
1959                         continue;
1960
1961                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1962                         _cleanup_free_ char *prefixed = NULL;
1963
1964                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1965                         if (!prefixed)
1966                                 return -ENOMEM;
1967
1968                         if (!strextend_with_separator(&joined, ":", prefixed))
1969                                 return -ENOMEM;
1970                 }
1971
1972                 x = strjoin(n, "=", joined);
1973                 if (!x)
1974                         return -ENOMEM;
1975
1976                 our_env[n_env++] = x;
1977         }
1978
1979         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1980                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1981                 if (!x)
1982                         return -ENOMEM;
1983
1984                 our_env[n_env++] = x;
1985         }
1986
1987         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1988                 return -ENOMEM;
1989
1990         our_env[n_env++] = x;
1991
1992         our_env[n_env++] = NULL;
1993         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1994 #undef N_ENV_VARS
1995
1996         *ret = TAKE_PTR(our_env);
1997
1998         return 0;
1999 }
2000
2001 static int build_pass_environment(const ExecContext *c, char ***ret) {
2002         _cleanup_strv_free_ char **pass_env = NULL;
2003         size_t n_env = 0;
2004
2005         STRV_FOREACH(i, c->pass_environment) {
2006                 _cleanup_free_ char *x = NULL;
2007                 char *v;
2008
2009                 v = getenv(*i);
2010                 if (!v)
2011                         continue;
2012                 x = strjoin(*i, "=", v);
2013                 if (!x)
2014                         return -ENOMEM;
2015
2016                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2017                         return -ENOMEM;
2018
2019                 pass_env[n_env++] = TAKE_PTR(x);
2020                 pass_env[n_env] = NULL;
2021         }
2022
2023         *ret = TAKE_PTR(pass_env);
2024
2025         return 0;
2026 }
2027
2028 bool exec_needs_mount_namespace(
2029                 const ExecContext *context,
2030                 const ExecParameters *params,
2031                 const ExecRuntime *runtime) {
2032
2033         assert(context);
2034
2035         if (context->root_image)
2036                 return true;
2037
2038         if (!strv_isempty(context->read_write_paths) ||
2039             !strv_isempty(context->read_only_paths) ||
2040             !strv_isempty(context->inaccessible_paths) ||
2041             !strv_isempty(context->exec_paths) ||
2042             !strv_isempty(context->no_exec_paths))
2043                 return true;
2044
2045         if (context->n_bind_mounts > 0)
2046                 return true;
2047
2048         if (context->n_temporary_filesystems > 0)
2049                 return true;
2050
2051         if (context->n_mount_images > 0)
2052                 return true;
2053
2054         if (context->n_extension_images > 0)
2055                 return true;
2056
2057         if (!strv_isempty(context->extension_directories))
2058                 return true;
2059
2060         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2061                 return true;
2062
2063         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2064                 return true;
2065
2066         if (context->private_devices ||
2067             context->private_mounts ||
2068             context->protect_system != PROTECT_SYSTEM_NO ||
2069             context->protect_home != PROTECT_HOME_NO ||
2070             context->protect_kernel_tunables ||
2071             context->protect_kernel_modules ||
2072             context->protect_kernel_logs ||
2073             context->protect_control_groups ||
2074             context->protect_proc != PROTECT_PROC_DEFAULT ||
2075             context->proc_subset != PROC_SUBSET_ALL ||
2076             context->private_ipc ||
2077             context->ipc_namespace_path)
2078                 return true;
2079
2080         if (context->root_directory) {
2081                 if (exec_context_get_effective_mount_apivfs(context))
2082                         return true;
2083
2084                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2085                         if (params && !params->prefix[t])
2086                                 continue;
2087
2088                         if (context->directories[t].n_items > 0)
2089                                 return true;
2090                 }
2091         }
2092
2093         if (context->dynamic_user &&
2094             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2095              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2096              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2097                 return true;
2098
2099         if (context->log_namespace)
2100                 return true;
2101
2102         return false;
2103 }
2104
2105 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2106         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2107         _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2108         _cleanup_close_ int unshare_ready_fd = -EBADF;
2109         _cleanup_(sigkill_waitp) pid_t pid = 0;
2110         uint64_t c = 1;
2111         ssize_t n;
2112         int r;
2113
2114         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2115          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2116          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2117          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2118          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2119          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2120          * continues execution normally.
2121          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2122          * does not need CAP_SETUID to write the single line mapping to itself. */
2123
2124         /* Can only set up multiple mappings with CAP_SETUID. */
2125         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2126                 r = asprintf(&uid_map,
2127                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2128                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2129                              ouid, ouid, uid, uid);
2130         else
2131                 r = asprintf(&uid_map,
2132                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2133                              ouid, ouid);
2134
2135         if (r < 0)
2136                 return -ENOMEM;
2137
2138         /* Can only set up multiple mappings with CAP_SETGID. */
2139         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2140                 r = asprintf(&gid_map,
2141                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2142                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2143                              ogid, ogid, gid, gid);
2144         else
2145                 r = asprintf(&gid_map,
2146                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2147                              ogid, ogid);
2148
2149         if (r < 0)
2150                 return -ENOMEM;
2151
2152         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2153          * namespace. */
2154         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2155         if (unshare_ready_fd < 0)
2156                 return -errno;
2157
2158         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2159          * failed. */
2160         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2161                 return -errno;
2162
2163         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2164         if (r < 0)
2165                 return r;
2166         if (r == 0) {
2167                 _cleanup_close_ int fd = -EBADF;
2168                 const char *a;
2169                 pid_t ppid;
2170
2171                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2172                  * here, after the parent opened its own user namespace. */
2173
2174                 ppid = getppid();
2175                 errno_pipe[0] = safe_close(errno_pipe[0]);
2176
2177                 /* Wait until the parent unshared the user namespace */
2178                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2179                         r = -errno;
2180                         goto child_fail;
2181                 }
2182
2183                 /* Disable the setgroups() system call in the child user namespace, for good. */
2184                 a = procfs_file_alloca(ppid, "setgroups");
2185                 fd = open(a, O_WRONLY|O_CLOEXEC);
2186                 if (fd < 0) {
2187                         if (errno != ENOENT) {
2188                                 r = -errno;
2189                                 goto child_fail;
2190                         }
2191
2192                         /* If the file is missing the kernel is too old, let's continue anyway. */
2193                 } else {
2194                         if (write(fd, "deny\n", 5) < 0) {
2195                                 r = -errno;
2196                                 goto child_fail;
2197                         }
2198
2199                         fd = safe_close(fd);
2200                 }
2201
2202                 /* First write the GID map */
2203                 a = procfs_file_alloca(ppid, "gid_map");
2204                 fd = open(a, O_WRONLY|O_CLOEXEC);
2205                 if (fd < 0) {
2206                         r = -errno;
2207                         goto child_fail;
2208                 }
2209                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2210                         r = -errno;
2211                         goto child_fail;
2212                 }
2213                 fd = safe_close(fd);
2214
2215                 /* The write the UID map */
2216                 a = procfs_file_alloca(ppid, "uid_map");
2217                 fd = open(a, O_WRONLY|O_CLOEXEC);
2218                 if (fd < 0) {
2219                         r = -errno;
2220                         goto child_fail;
2221                 }
2222                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2223                         r = -errno;
2224                         goto child_fail;
2225                 }
2226
2227                 _exit(EXIT_SUCCESS);
2228
2229         child_fail:
2230                 (void) write(errno_pipe[1], &r, sizeof(r));
2231                 _exit(EXIT_FAILURE);
2232         }
2233
2234         errno_pipe[1] = safe_close(errno_pipe[1]);
2235
2236         if (unshare(CLONE_NEWUSER) < 0)
2237                 return -errno;
2238
2239         /* Let the child know that the namespace is ready now */
2240         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2241                 return -errno;
2242
2243         /* Try to read an error code from the child */
2244         n = read(errno_pipe[0], &r, sizeof(r));
2245         if (n < 0)
2246                 return -errno;
2247         if (n == sizeof(r)) { /* an error code was sent to us */
2248                 if (r < 0)
2249                         return r;
2250                 return -EIO;
2251         }
2252         if (n != 0) /* on success we should have read 0 bytes */
2253                 return -EIO;
2254
2255         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2256         if (r < 0)
2257                 return r;
2258         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2259                 return -EIO;
2260
2261         return 0;
2262 }
2263
2264 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2265         if (!context->dynamic_user)
2266                 return false;
2267
2268         if (type == EXEC_DIRECTORY_CONFIGURATION)
2269                 return false;
2270
2271         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2272                 return false;
2273
2274         return true;
2275 }
2276
2277 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2278         _cleanup_free_ char *src_abs = NULL;
2279         int r;
2280
2281         assert(source);
2282
2283         src_abs = path_join(root, source);
2284         if (!src_abs)
2285                 return -ENOMEM;
2286
2287         STRV_FOREACH(dst, symlinks) {
2288                 _cleanup_free_ char *dst_abs = NULL;
2289
2290                 dst_abs = path_join(root, *dst);
2291                 if (!dst_abs)
2292                         return -ENOMEM;
2293
2294                 r = mkdir_parents_label(dst_abs, 0755);
2295                 if (r < 0)
2296                         return r;
2297
2298                 r = symlink_idempotent(src_abs, dst_abs, true);
2299                 if (r < 0)
2300                         return r;
2301         }
2302
2303         return 0;
2304 }
2305
2306 static int setup_exec_directory(
2307                 const ExecContext *context,
2308                 const ExecParameters *params,
2309                 uid_t uid,
2310                 gid_t gid,
2311                 ExecDirectoryType type,
2312                 bool needs_mount_namespace,
2313                 int *exit_status) {
2314
2315         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2316                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2317                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2318                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2319                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2320                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2321         };
2322         int r;
2323
2324         assert(context);
2325         assert(params);
2326         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2327         assert(exit_status);
2328
2329         if (!params->prefix[type])
2330                 return 0;
2331
2332         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2333                 if (!uid_is_valid(uid))
2334                         uid = 0;
2335                 if (!gid_is_valid(gid))
2336                         gid = 0;
2337         }
2338
2339         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2340                 _cleanup_free_ char *p = NULL, *pp = NULL;
2341
2342                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2343                 if (!p) {
2344                         r = -ENOMEM;
2345                         goto fail;
2346                 }
2347
2348                 r = mkdir_parents_label(p, 0755);
2349                 if (r < 0)
2350                         goto fail;
2351
2352                 if (exec_directory_is_private(context, type)) {
2353                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2354                          * case we want to avoid leaving a directory around fully accessible that is owned by
2355                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2356                          * trick used by container managers to prohibit host users to get access to files of
2357                          * the same UID in containers: we place everything inside a directory that has an
2358                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2359                          * for unprivileged host code. We then use fs namespacing to make this directory
2360                          * permeable for the service itself.
2361                          *
2362                          * Specifically: for a service which wants a special directory "foo/" we first create
2363                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2364                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2365                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2366                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2367                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2368                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2369                          * for the service and making sure it only gets access to the dirs it needs but no
2370                          * others. Tricky? Yes, absolutely, but it works!
2371                          *
2372                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2373                          * to be owned by the service itself.
2374                          *
2375                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2376                          * for sharing files or sockets with other services. */
2377
2378                         pp = path_join(params->prefix[type], "private");
2379                         if (!pp) {
2380                                 r = -ENOMEM;
2381                                 goto fail;
2382                         }
2383
2384                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2385                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2386                         if (r < 0)
2387                                 goto fail;
2388
2389                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2390                                 r = -ENOMEM;
2391                                 goto fail;
2392                         }
2393
2394                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2395                         r = mkdir_parents_label(pp, 0755);
2396                         if (r < 0)
2397                                 goto fail;
2398
2399                         if (is_dir(p, false) > 0 &&
2400                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2401
2402                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2403                                  * it over. Most likely the service has been upgraded from one that didn't use
2404                                  * DynamicUser=1, to one that does. */
2405
2406                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2407                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2408                                          exec_directory_type_to_string(type), p, pp);
2409
2410                                 if (rename(p, pp) < 0) {
2411                                         r = -errno;
2412                                         goto fail;
2413                                 }
2414                         } else {
2415                                 /* Otherwise, create the actual directory for the service */
2416
2417                                 r = mkdir_label(pp, context->directories[type].mode);
2418                                 if (r < 0 && r != -EEXIST)
2419                                         goto fail;
2420                         }
2421
2422                         if (!context->directories[type].items[i].only_create) {
2423                                 /* And link it up from the original place.
2424                                  * Notes
2425                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2426                                  *    the host, and a new one for the child namespace will be created later.
2427                                  * 2) It is not necessary to create this symlink when one of its parent
2428                                  *    directories is specified and already created. E.g.
2429                                  *        StateDirectory=foo foo/bar
2430                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2431                                  *        pp = "/var/lib/private/foo/bar"
2432                                  *        p = "/var/lib/foo/bar"
2433                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2434                                  *    we do not need to create the symlink, but we cannot create the symlink.
2435                                  *    See issue #24783. */
2436                                 r = symlink_idempotent(pp, p, true);
2437                                 if (r < 0)
2438                                         goto fail;
2439                         }
2440
2441                 } else {
2442                         _cleanup_free_ char *target = NULL;
2443
2444                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2445                             readlink_and_make_absolute(p, &target) >= 0) {
2446                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2447
2448                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2449                                  * by DynamicUser=1 (see above)?
2450                                  *
2451                                  * We do this for all directory types except for ConfigurationDirectory=,
2452                                  * since they all support the private/ symlink logic at least in some
2453                                  * configurations, see above. */
2454
2455                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2456                                 if (r < 0)
2457                                         goto fail;
2458
2459                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2460                                 if (!q) {
2461                                         r = -ENOMEM;
2462                                         goto fail;
2463                                 }
2464
2465                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2466                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2467                                 if (r < 0)
2468                                         goto fail;
2469
2470                                 if (path_equal(q_resolved, target_resolved)) {
2471
2472                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2473                                          * but is no longer. Let's move the directory back up. */
2474
2475                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2476                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2477                                                  exec_directory_type_to_string(type), q, p);
2478
2479                                         if (unlink(p) < 0) {
2480                                                 r = -errno;
2481                                                 goto fail;
2482                                         }
2483
2484                                         if (rename(q, p) < 0) {
2485                                                 r = -errno;
2486                                                 goto fail;
2487                                         }
2488                                 }
2489                         }
2490
2491                         r = mkdir_label(p, context->directories[type].mode);
2492                         if (r < 0) {
2493                                 if (r != -EEXIST)
2494                                         goto fail;
2495
2496                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2497                                         struct stat st;
2498
2499                                         /* Don't change the owner/access mode of the configuration directory,
2500                                          * as in the common case it is not written to by a service, and shall
2501                                          * not be writable. */
2502
2503                                         if (stat(p, &st) < 0) {
2504                                                 r = -errno;
2505                                                 goto fail;
2506                                         }
2507
2508                                         /* Still complain if the access mode doesn't match */
2509                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2510                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2511                                                             "(File system: %o %sMode: %o)",
2512                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2513                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2514
2515                                         continue;
2516                                 }
2517                         }
2518                 }
2519
2520                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2521                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2522                  * current UID/GID ownership.) */
2523                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2524                 if (r < 0)
2525                         goto fail;
2526
2527                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2528                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2529                  * assignments to exist. */
2530                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2531                 if (r < 0)
2532                         goto fail;
2533         }
2534
2535         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2536          * they are set up later, to allow configuring empty var/run/etc. */
2537         if (!needs_mount_namespace)
2538                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2539                         r = create_many_symlinks(params->prefix[type],
2540                                                  context->directories[type].items[i].path,
2541                                                  context->directories[type].items[i].symlinks);
2542                         if (r < 0)
2543                                 goto fail;
2544                 }
2545
2546         return 0;
2547
2548 fail:
2549         *exit_status = exit_status_table[type];
2550         return r;
2551 }
2552
2553 static int write_credential(
2554                 int dfd,
2555                 const char *id,
2556                 const void *data,
2557                 size_t size,
2558                 uid_t uid,
2559                 bool ownership_ok) {
2560
2561         _cleanup_(unlink_and_freep) char *tmp = NULL;
2562         _cleanup_close_ int fd = -EBADF;
2563         int r;
2564
2565         r = tempfn_random_child("", "cred", &tmp);
2566         if (r < 0)
2567                 return r;
2568
2569         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2570         if (fd < 0) {
2571                 tmp = mfree(tmp);
2572                 return -errno;
2573         }
2574
2575         r = loop_write(fd, data, size, /* do_poll = */ false);
2576         if (r < 0)
2577                 return r;
2578
2579         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2580                 return -errno;
2581
2582         if (uid_is_valid(uid) && uid != getuid()) {
2583                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2584                 if (r < 0) {
2585                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2586                                 return r;
2587
2588                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2589                                             * to express: that the user gets read access and nothing
2590                                             * else. But if the backing fs can't support that (e.g. ramfs)
2591                                             * then we can use file ownership instead. But that's only safe if
2592                                             * we can then re-mount the whole thing read-only, so that the
2593                                             * user can no longer chmod() the file to gain write access. */
2594                                 return r;
2595
2596                         if (fchown(fd, uid, GID_INVALID) < 0)
2597                                 return -errno;
2598                 }
2599         }
2600
2601         if (renameat(dfd, tmp, dfd, id) < 0)
2602                 return -errno;
2603
2604         tmp = mfree(tmp);
2605         return 0;
2606 }
2607
2608 static char **credential_search_path(
2609                 const ExecParameters *params,
2610                 bool encrypted) {
2611
2612         _cleanup_strv_free_ char **l = NULL;
2613
2614         assert(params);
2615
2616         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2617          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2618          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2619
2620         if (encrypted) {
2621                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2622                         return NULL;
2623
2624                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2625                         return NULL;
2626         }
2627
2628         if (params->received_credentials_directory)
2629                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2630                         return NULL;
2631
2632         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2633                 return NULL;
2634
2635         if (DEBUG_LOGGING) {
2636                 _cleanup_free_ char *t = strv_join(l, ":");
2637
2638                 log_debug("Credential search path is: %s", t);
2639         }
2640
2641         return TAKE_PTR(l);
2642 }
2643
2644 static int load_credential(
2645                 const ExecContext *context,
2646                 const ExecParameters *params,
2647                 const char *id,
2648                 const char *path,
2649                 bool encrypted,
2650                 const char *unit,
2651                 int read_dfd,
2652                 int write_dfd,
2653                 uid_t uid,
2654                 bool ownership_ok,
2655                 uint64_t *left) {
2656
2657         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2658         _cleanup_strv_free_ char **search_path = NULL;
2659         _cleanup_(erase_and_freep) char *data = NULL;
2660         _cleanup_free_ char *bindname = NULL;
2661         const char *source = NULL;
2662         bool missing_ok = true;
2663         size_t size, add, maxsz;
2664         int r;
2665
2666         assert(context);
2667         assert(params);
2668         assert(id);
2669         assert(path);
2670         assert(unit);
2671         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2672         assert(write_dfd >= 0);
2673         assert(left);
2674
2675         if (read_dfd >= 0) {
2676                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2677                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2678                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2679                  * open it. */
2680
2681                 if (!filename_is_valid(path)) /* safety check */
2682                         return -EINVAL;
2683
2684                 missing_ok = true;
2685                 source = path;
2686
2687         } else if (path_is_absolute(path)) {
2688                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2689                  * sockets */
2690
2691                 if (!path_is_valid(path)) /* safety check */
2692                         return -EINVAL;
2693
2694                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2695
2696                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2697                  * via the source socket address in case we read off an AF_UNIX socket. */
2698                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2699                         return -ENOMEM;
2700
2701                 missing_ok = false;
2702                 source = path;
2703
2704         } else if (credential_name_valid(path)) {
2705                 /* If this is a relative path, take it as credential name relative to the credentials
2706                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2707                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2708
2709                 search_path = credential_search_path(params, encrypted);
2710                 if (!search_path)
2711                         return -ENOMEM;
2712
2713                 missing_ok = true;
2714         } else
2715                 source = NULL;
2716
2717         if (encrypted)
2718                 flags |= READ_FULL_FILE_UNBASE64;
2719
2720         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2721
2722         if (search_path) {
2723                 STRV_FOREACH(d, search_path) {
2724                         _cleanup_free_ char *j = NULL;
2725
2726                         j = path_join(*d, path);
2727                         if (!j)
2728                                 return -ENOMEM;
2729
2730                         r = read_full_file_full(
2731                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2732                                         UINT64_MAX,
2733                                         maxsz,
2734                                         flags,
2735                                         NULL,
2736                                         &data, &size);
2737                         if (r != -ENOENT)
2738                                 break;
2739                 }
2740         } else if (source)
2741                 r = read_full_file_full(
2742                                 read_dfd, source,
2743                                 UINT64_MAX,
2744                                 maxsz,
2745                                 flags,
2746                                 bindname,
2747                                 &data, &size);
2748         else
2749                 r = -ENOENT;
2750
2751         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2752                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2753                  * will get clear errors if we don't pass such a missing credential on as they
2754                  * themselves will get ENOENT when trying to read them, which should not be much
2755                  * worse than when we handle the error here and make it fatal.
2756                  *
2757                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2758                  * we are fine, too. */
2759                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2760                 return 0;
2761         }
2762         if (r < 0)
2763                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2764
2765         if (encrypted) {
2766                 _cleanup_free_ void *plaintext = NULL;
2767                 size_t plaintext_size = 0;
2768
2769                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2770                 if (r < 0)
2771                         return r;
2772
2773                 free_and_replace(data, plaintext);
2774                 size = plaintext_size;
2775         }
2776
2777         add = strlen(id) + size;
2778         if (add > *left)
2779                 return -E2BIG;
2780
2781         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2782         if (r < 0)
2783                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2784
2785         *left -= add;
2786         return 0;
2787 }
2788
/* Userdata handed via recurse_dir() to load_cred_recurse_dir_cb(). It bundles the arguments the
 * callback passes on to load_credential() for each directory entry it picks up. */
struct load_cred_args {
        const ExecContext *context;     /* passed through to load_credential() */
        const ExecParameters *params;   /* passed through to load_credential() */
        bool encrypted;                 /* passed as load_credential()'s 'encrypted' argument */
        const char *unit;               /* passed as load_credential()'s 'unit' argument */
        int dfd;                        /* directory fd credentials are written to; the callback also
                                         * checks it for already existing credential IDs */
        uid_t uid;                      /* passed through to load_credential() */
        bool ownership_ok;              /* passed through to load_credential() */
        uint64_t *left;                 /* passed as load_credential()'s 'left' argument */
};
2799
2800 static int load_cred_recurse_dir_cb(
2801                 RecurseDirEvent event,
2802                 const char *path,
2803                 int dir_fd,
2804                 int inode_fd,
2805                 const struct dirent *de,
2806                 const struct statx *sx,
2807                 void *userdata) {
2808
2809         struct load_cred_args *args = ASSERT_PTR(userdata);
2810         _cleanup_free_ char *sub_id = NULL;
2811         int r;
2812
2813         if (event != RECURSE_DIR_ENTRY)
2814                 return RECURSE_DIR_CONTINUE;
2815
2816         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2817                 return RECURSE_DIR_CONTINUE;
2818
2819         sub_id = strreplace(path, "/", "_");
2820         if (!sub_id)
2821                 return -ENOMEM;
2822
2823         if (!credential_name_valid(sub_id))
2824                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2825
2826         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2827                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2828                 return RECURSE_DIR_CONTINUE;
2829         }
2830         if (errno != ENOENT)
2831                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2832
2833         r = load_credential(
2834                         args->context,
2835                         args->params,
2836                         sub_id,
2837                         de->d_name,
2838                         args->encrypted,
2839                         args->unit,
2840                         dir_fd,
2841                         args->dfd,
2842                         args->uid,
2843                         args->ownership_ok,
2844                         args->left);
2845         if (r < 0)
2846                 return r;
2847
2848         return RECURSE_DIR_CONTINUE;
2849 }
2850
2851 static int acquire_credentials(
2852                 const ExecContext *context,
2853                 const ExecParameters *params,
2854                 const char *unit,
2855                 const char *p,
2856                 uid_t uid,
2857                 bool ownership_ok) {
2858
2859         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2860         _cleanup_close_ int dfd = -EBADF;
2861         ExecLoadCredential *lc;
2862         ExecSetCredential *sc;
2863         int r;
2864
2865         assert(context);
2866         assert(p);
2867
2868         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2869         if (dfd < 0)
2870                 return -errno;
2871
2872         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2873         HASHMAP_FOREACH(lc, context->load_credentials) {
2874                 _cleanup_close_ int sub_fd = -EBADF;
2875
2876                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2877                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2878                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
2879                  * propagate a credential passed to us from further up. */
2880
2881                 if (path_is_absolute(lc->path)) {
2882                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2883                         if (sub_fd < 0 && !IN_SET(errno,
2884                                                   ENOTDIR,  /* Not a directory */
2885                                                   ENOENT))  /* Doesn't exist? */
2886                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2887                 }
2888
2889                 if (sub_fd < 0)
2890                         /* Regular file (incl. a credential passed in from higher up) */
2891                         r = load_credential(
2892                                         context,
2893                                         params,
2894                                         lc->id,
2895                                         lc->path,
2896                                         lc->encrypted,
2897                                         unit,
2898                                         AT_FDCWD,
2899                                         dfd,
2900                                         uid,
2901                                         ownership_ok,
2902                                         &left);
2903                 else
2904                         /* Directory */
2905                         r = recurse_dir(
2906                                         sub_fd,
2907                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2908                                         /* statx_mask= */ 0,
2909                                         /* n_depth_max= */ UINT_MAX,
2910                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2911                                         load_cred_recurse_dir_cb,
2912                                         &(struct load_cred_args) {
2913                                                 .context = context,
2914                                                 .params = params,
2915                                                 .encrypted = lc->encrypted,
2916                                                 .unit = unit,
2917                                                 .dfd = dfd,
2918                                                 .uid = uid,
2919                                                 .ownership_ok = ownership_ok,
2920                                                 .left = &left,
2921                                         });
2922                 if (r < 0)
2923                         return r;
2924         }
2925
2926         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2927          * them, so that they can act as a "default" if the same credential is specified multiple times. */
2928         HASHMAP_FOREACH(sc, context->set_credentials) {
2929                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2930                 const char *data;
2931                 size_t size, add;
2932
2933                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2934                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2935                  * slow and involved, hence it's nice to be able to skip that if the credential already
2936                  * exists anyway. */
2937                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2938                         continue;
2939                 if (errno != ENOENT)
2940                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2941
2942                 if (sc->encrypted) {
2943                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2944                         if (r < 0)
2945                                 return r;
2946
2947                         data = plaintext;
2948                 } else {
2949                         data = sc->data;
2950                         size = sc->size;
2951                 }
2952
2953                 add = strlen(sc->id) + size;
2954                 if (add > left)
2955                         return -E2BIG;
2956
2957                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2958                 if (r < 0)
2959                         return r;
2960
2961                 left -= add;
2962         }
2963
2964         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2965                 return -errno;
2966
2967         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2968          * accessible */
2969
2970         if (uid_is_valid(uid) && uid != getuid()) {
2971                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2972                 if (r < 0) {
2973                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2974                                 return r;
2975
2976                         if (!ownership_ok)
2977                                 return r;
2978
2979                         if (fchown(dfd, uid, GID_INVALID) < 0)
2980                                 return -errno;
2981                 }
2982         }
2983
2984         return 0;
2985 }
2986
2987 static int setup_credentials_internal(
2988                 const ExecContext *context,
2989                 const ExecParameters *params,
2990                 const char *unit,
2991                 const char *final,        /* This is where the credential store shall eventually end up at */
2992                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2993                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2994                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2995                 uid_t uid) {
2996
2997         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2998                                    * if we mounted something; false if we definitely can't mount anything */
2999         bool final_mounted;
3000         const char *where;
3001
3002         assert(context);
3003         assert(final);
3004         assert(workspace);
3005
3006         if (reuse_workspace) {
3007                 r = path_is_mount_point(workspace, NULL, 0);
3008                 if (r < 0)
3009                         return r;
3010                 if (r > 0)
3011                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3012                 else
3013                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3014         } else
3015                 workspace_mounted = -1; /* ditto */
3016
3017         r = path_is_mount_point(final, NULL, 0);
3018         if (r < 0)
3019                 return r;
3020         if (r > 0) {
3021                 /* If the final place already has something mounted, we use that. If the workspace also has
3022                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3023                  * different). */
3024                 final_mounted = true;
3025
3026                 if (workspace_mounted < 0) {
3027                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3028                          * the final version to the workspace, and make it writable, so that we can make
3029                          * changes */
3030
3031                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3032                         if (r < 0)
3033                                 return r;
3034
3035                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3036                         if (r < 0)
3037                                 return r;
3038
3039                         workspace_mounted = true;
3040                 }
3041         } else
3042                 final_mounted = false;
3043
3044         if (workspace_mounted < 0) {
3045                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3046                 for (int try = 0;; try++) {
3047
3048                         if (try == 0) {
3049                                 /* Try "ramfs" first, since it's not swap backed */
3050                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3051                                 if (r >= 0) {
3052                                         workspace_mounted = true;
3053                                         break;
3054                                 }
3055
3056                         } else if (try == 1) {
3057                                 _cleanup_free_ char *opts = NULL;
3058
3059                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3060                                         return -ENOMEM;
3061
3062                                 /* Fall back to "tmpfs" otherwise */
3063                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3064                                 if (r >= 0) {
3065                                         workspace_mounted = true;
3066                                         break;
3067                                 }
3068
3069                         } else {
3070                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3071                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3072                                 if (r < 0) {
3073                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3074                                                 return r;
3075
3076                                         if (must_mount) /* If we it's not OK to use the plain directory
3077                                                          * fallback, propagate all errors too */
3078                                                 return r;
3079
3080                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3081                                          * proceed for compat with container envs, and just use the final dir
3082                                          * as is. */
3083
3084                                         workspace_mounted = false;
3085                                         break;
3086                                 }
3087
3088                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3089                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3090                                 if (r < 0)
3091                                         return r;
3092
3093                                 workspace_mounted = true;
3094                                 break;
3095                         }
3096                 }
3097         }
3098
3099         assert(!must_mount || workspace_mounted > 0);
3100         where = workspace_mounted ? workspace : final;
3101
3102         (void) label_fix_full(AT_FDCWD, where, final, 0);
3103
3104         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3105         if (r < 0)
3106                 return r;
3107
3108         if (workspace_mounted) {
3109                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3110                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3111                 if (r < 0)
3112                         return r;
3113
3114                 /* And mount it to the final place, read-only */
3115                 if (final_mounted)
3116                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3117                 else
3118                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3119                 if (r < 0)
3120                         return r;
3121         } else {
3122                 _cleanup_free_ char *parent = NULL;
3123
3124                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3125                  * open access to the top-level credential directory and the per-service directory now */
3126
3127                 r = path_extract_directory(final, &parent);
3128                 if (r < 0)
3129                         return r;
3130                 if (chmod(parent, 0755) < 0)
3131                         return -errno;
3132         }
3133
3134         return 0;
3135 }
3136
3137 static int setup_credentials(
3138                 const ExecContext *context,
3139                 const ExecParameters *params,
3140                 const char *unit,
3141                 uid_t uid) {
3142
3143         _cleanup_free_ char *p = NULL, *q = NULL;
3144         int r;
3145
3146         assert(context);
3147         assert(params);
3148
3149         if (!exec_context_has_credentials(context))
3150                 return 0;
3151
3152         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3153                 return -EINVAL;
3154
3155         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3156          * and the subdir we mount over with a read-only file system readable by the service's user */
3157         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3158         if (!q)
3159                 return -ENOMEM;
3160
3161         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3162         if (r < 0 && r != -EEXIST)
3163                 return r;
3164
3165         p = path_join(q, unit);
3166         if (!p)
3167                 return -ENOMEM;
3168
3169         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3170         if (r < 0 && r != -EEXIST)
3171                 return r;
3172
3173         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3174         if (r < 0) {
3175                 _cleanup_free_ char *t = NULL, *u = NULL;
3176
3177                 /* If this is not a privilege or support issue then propagate the error */
3178                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3179                         return r;
3180
3181                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3182                  * it into place, so that users can't access half-initialized credential stores. */
3183                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3184                 if (!t)
3185                         return -ENOMEM;
3186
3187                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3188                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3189                  * after it is fully set up */
3190                 u = path_join(t, unit);
3191                 if (!u)
3192                         return -ENOMEM;
3193
3194                 FOREACH_STRING(i, t, u) {
3195                         r = mkdir_label(i, 0700);
3196                         if (r < 0 && r != -EEXIST)
3197                                 return r;
3198                 }
3199
3200                 r = setup_credentials_internal(
3201                                 context,
3202                                 params,
3203                                 unit,
3204                                 p,       /* final mount point */
3205                                 u,       /* temporary workspace to overmount */
3206                                 true,    /* reuse the workspace if it is already a mount */
3207                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3208                                 uid);
3209
3210                 (void) rmdir(u); /* remove the workspace again if we can. */
3211
3212                 if (r < 0)
3213                         return r;
3214
3215         } else if (r == 0) {
3216
3217                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3218                  * we can use the same directory for all cases, after turning off propagation. Question
3219                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3220                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3221                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3222                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3223                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3224                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3225                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3226                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3227                  * propagation on the former, and then overmount the latter.
3228                  *
3229                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3230                  * for this purpose, but there are few other candidates that work equally well for us, and
3231                  * given that the we do this in a privately namespaced short-lived single-threaded process
3232                  * that no one else sees this should be OK to do. */
3233
3234                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3235                 if (r < 0)
3236                         goto child_fail;
3237
3238                 r = setup_credentials_internal(
3239                                 context,
3240                                 params,
3241                                 unit,
3242                                 p,           /* final mount point */
3243                                 "/dev/shm",  /* temporary workspace to overmount */
3244                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3245                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3246                                 uid);
3247                 if (r < 0)
3248                         goto child_fail;
3249
3250                 _exit(EXIT_SUCCESS);
3251
3252         child_fail:
3253                 _exit(EXIT_FAILURE);
3254         }
3255
3256         return 0;
3257 }
3258
3259 #if ENABLE_SMACK
3260 static int setup_smack(
3261                 const Manager *manager,
3262                 const ExecContext *context,
3263                 int executable_fd) {
3264         int r;
3265
3266         assert(context);
3267         assert(executable_fd >= 0);
3268
3269         if (context->smack_process_label) {
3270                 r = mac_smack_apply_pid(0, context->smack_process_label);
3271                 if (r < 0)
3272                         return r;
3273         } else if (manager->default_smack_process_label) {
3274                 _cleanup_free_ char *exec_label = NULL;
3275
3276                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3277                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3278                         return r;
3279
3280                 r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
3281                 if (r < 0)
3282                         return r;
3283         }
3284
3285         return 0;
3286 }
3287 #endif
3288
3289 static int compile_bind_mounts(
3290                 const ExecContext *context,
3291                 const ExecParameters *params,
3292                 BindMount **ret_bind_mounts,
3293                 size_t *ret_n_bind_mounts,
3294                 char ***ret_empty_directories) {
3295
3296         _cleanup_strv_free_ char **empty_directories = NULL;
3297         BindMount *bind_mounts;
3298         size_t n, h = 0;
3299         int r;
3300
3301         assert(context);
3302         assert(params);
3303         assert(ret_bind_mounts);
3304         assert(ret_n_bind_mounts);
3305         assert(ret_empty_directories);
3306
3307         n = context->n_bind_mounts;
3308         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3309                 if (!params->prefix[t])
3310                         continue;
3311
3312                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3313                         n += !context->directories[t].items[i].only_create;
3314         }
3315
3316         if (n <= 0) {
3317                 *ret_bind_mounts = NULL;
3318                 *ret_n_bind_mounts = 0;
3319                 *ret_empty_directories = NULL;
3320                 return 0;
3321         }
3322
3323         bind_mounts = new(BindMount, n);
3324         if (!bind_mounts)
3325                 return -ENOMEM;
3326
3327         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3328                 BindMount *item = context->bind_mounts + i;
3329                 char *s, *d;
3330
3331                 s = strdup(item->source);
3332                 if (!s) {
3333                         r = -ENOMEM;
3334                         goto finish;
3335                 }
3336
3337                 d = strdup(item->destination);
3338                 if (!d) {
3339                         free(s);
3340                         r = -ENOMEM;
3341                         goto finish;
3342                 }
3343
3344                 bind_mounts[h++] = (BindMount) {
3345                         .source = s,
3346                         .destination = d,
3347                         .read_only = item->read_only,
3348                         .recursive = item->recursive,
3349                         .ignore_enoent = item->ignore_enoent,
3350                 };
3351         }
3352
3353         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3354                 if (!params->prefix[t])
3355                         continue;
3356
3357                 if (context->directories[t].n_items == 0)
3358                         continue;
3359
3360                 if (exec_directory_is_private(context, t) &&
3361                     !exec_context_with_rootfs(context)) {
3362                         char *private_root;
3363
3364                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3365                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3366                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3367
3368                         private_root = path_join(params->prefix[t], "private");
3369                         if (!private_root) {
3370                                 r = -ENOMEM;
3371                                 goto finish;
3372                         }
3373
3374                         r = strv_consume(&empty_directories, private_root);
3375                         if (r < 0)
3376                                 goto finish;
3377                 }
3378
3379                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3380                         char *s, *d;
3381
3382                         /* When one of the parent directories is in the list, we cannot create the symlink
3383                          * for the child directory. See also the comments in setup_exec_directory(). */
3384                         if (context->directories[t].items[i].only_create)
3385                                 continue;
3386
3387                         if (exec_directory_is_private(context, t))
3388                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3389                         else
3390                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3391                         if (!s) {
3392                                 r = -ENOMEM;
3393                                 goto finish;
3394                         }
3395
3396                         if (exec_directory_is_private(context, t) &&
3397                             exec_context_with_rootfs(context))
3398                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3399                                  * directory is not created on the root directory. So, let's bind-mount the directory
3400                                  * on the 'non-private' place. */
3401                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3402                         else
3403                                 d = strdup(s);
3404                         if (!d) {
3405                                 free(s);
3406                                 r = -ENOMEM;
3407                                 goto finish;
3408                         }
3409
3410                         bind_mounts[h++] = (BindMount) {
3411                                 .source = s,
3412                                 .destination = d,
3413                                 .read_only = false,
3414                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3415                                 .recursive = true,
3416                                 .ignore_enoent = false,
3417                         };
3418                 }
3419         }
3420
3421         assert(h == n);
3422
3423         *ret_bind_mounts = bind_mounts;
3424         *ret_n_bind_mounts = n;
3425         *ret_empty_directories = TAKE_PTR(empty_directories);
3426
3427         return (int) n;
3428
3429 finish:
3430         bind_mount_free_many(bind_mounts, h);
3431         return r;
3432 }
3433
3434 /* ret_symlinks will contain a list of pairs src:dest that describes
3435  * the symlinks to create later on. For example, the symlinks needed
3436  * to safely give private directories to DynamicUser=1 users. */
3437 static int compile_symlinks(
3438                 const ExecContext *context,
3439                 const ExecParameters *params,
3440                 char ***ret_symlinks) {
3441
3442         _cleanup_strv_free_ char **symlinks = NULL;
3443         int r;
3444
3445         assert(context);
3446         assert(params);
3447         assert(ret_symlinks);
3448
3449         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3450                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3451                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3452
3453                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3454                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3455
3456                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3457                                 dst_abs = path_join(params->prefix[dt], *symlink);
3458                                 if (!src_abs || !dst_abs)
3459                                         return -ENOMEM;
3460
3461                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3462                                 if (r < 0)
3463                                         return r;
3464                         }
3465
3466                         if (!exec_directory_is_private(context, dt) ||
3467                             exec_context_with_rootfs(context) ||
3468                             context->directories[dt].items[i].only_create)
3469                                 continue;
3470
3471                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3472                         if (!private_path)
3473                                 return -ENOMEM;
3474
3475                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3476                         if (!path)
3477                                 return -ENOMEM;
3478
3479                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3480                         if (r < 0)
3481                                 return r;
3482                 }
3483         }
3484
3485         *ret_symlinks = TAKE_PTR(symlinks);
3486
3487         return 0;
3488 }
3489
3490 static bool insist_on_sandboxing(
3491                 const ExecContext *context,
3492                 const char *root_dir,
3493                 const char *root_image,
3494                 const BindMount *bind_mounts,
3495                 size_t n_bind_mounts) {
3496
3497         assert(context);
3498         assert(n_bind_mounts == 0 || bind_mounts);
3499
3500         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3501          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3502          * rearrange stuff in a way we cannot ignore gracefully. */
3503
3504         if (context->n_temporary_filesystems > 0)
3505                 return true;
3506
3507         if (root_dir || root_image)
3508                 return true;
3509
3510         if (context->n_mount_images > 0)
3511                 return true;
3512
3513         if (context->dynamic_user)
3514                 return true;
3515
3516         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3517                 return true;
3518
3519         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3520          * essential. */
3521         for (size_t i = 0; i < n_bind_mounts; i++)
3522                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3523                         return true;
3524
3525         if (context->log_namespace)
3526                 return true;
3527
3528         return false;
3529 }
3530
3531 static int apply_mount_namespace(
3532                 const Unit *u,
3533                 ExecCommandFlags command_flags,
3534                 const ExecContext *context,
3535                 const ExecParameters *params,
3536                 const ExecRuntime *runtime,
3537                 char **error_path) {
3538
3539         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3540         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3541         const char *root_dir = NULL, *root_image = NULL;
3542         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3543                         *extension_dir = NULL;
3544         NamespaceInfo ns_info;
3545         bool needs_sandboxing;
3546         BindMount *bind_mounts = NULL;
3547         size_t n_bind_mounts = 0;
3548         int r;
3549
3550         assert(context);
3551
3552         if (params->flags & EXEC_APPLY_CHROOT) {
3553                 root_image = context->root_image;
3554
3555                 if (!root_image)
3556                         root_dir = context->root_directory;
3557         }
3558
3559         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3560         if (r < 0)
3561                 return r;
3562
3563         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3564         r = compile_symlinks(context, params, &symlinks);
3565         if (r < 0)
3566                 goto finalize;
3567
3568         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3569         if (needs_sandboxing) {
3570                 /* The runtime struct only contains the parent of the private /tmp,
3571                  * which is non-accessible to world users. Inside of it there's a /tmp
3572                  * that is sticky, and that's the one we want to use here.
3573                  * This does not apply when we are using /run/systemd/empty as fallback. */
3574
3575                 if (context->private_tmp && runtime) {
3576                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3577                                 tmp_dir = runtime->tmp_dir;
3578                         else if (runtime->tmp_dir)
3579                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3580
3581                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3582                                 var_tmp_dir = runtime->var_tmp_dir;
3583                         else if (runtime->var_tmp_dir)
3584                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3585                 }
3586
3587                 ns_info = (NamespaceInfo) {
3588                         .ignore_protect_paths = false,
3589                         .private_dev = context->private_devices,
3590                         .protect_control_groups = context->protect_control_groups,
3591                         .protect_kernel_tunables = context->protect_kernel_tunables,
3592                         .protect_kernel_modules = context->protect_kernel_modules,
3593                         .protect_kernel_logs = context->protect_kernel_logs,
3594                         .protect_hostname = context->protect_hostname,
3595                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3596                         .private_mounts = context->private_mounts,
3597                         .protect_home = context->protect_home,
3598                         .protect_system = context->protect_system,
3599                         .protect_proc = context->protect_proc,
3600                         .proc_subset = context->proc_subset,
3601                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3602                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3603                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3604                 };
3605         } else if (!context->dynamic_user && root_dir)
3606                 /*
3607                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3608                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3609                  * fail if we are enable to apply the sandbox inside the mount namespace.
3610                  */
3611                 ns_info = (NamespaceInfo) {
3612                         .ignore_protect_paths = true,
3613                 };
3614         else
3615                 ns_info = (NamespaceInfo) {};
3616
3617         if (context->mount_flags == MS_SHARED)
3618                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3619
3620         if (exec_context_has_credentials(context) &&
3621             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3622             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3623                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3624                 if (!creds_path) {
3625                         r = -ENOMEM;
3626                         goto finalize;
3627                 }
3628         }
3629
3630         if (MANAGER_IS_SYSTEM(u->manager)) {
3631                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3632                 if (!propagate_dir) {
3633                         r = -ENOMEM;
3634                         goto finalize;
3635                 }
3636
3637                 incoming_dir = strdup("/run/systemd/incoming");
3638                 if (!incoming_dir) {
3639                         r = -ENOMEM;
3640                         goto finalize;
3641                 }
3642
3643                 extension_dir = strdup("/run/systemd/unit-extensions");
3644                 if (!extension_dir) {
3645                         r = -ENOMEM;
3646                         goto finalize;
3647                 }
3648         } else
3649                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3650                         r = -ENOMEM;
3651                         goto finalize;
3652                 }
3653
3654         r = setup_namespace(root_dir, root_image, context->root_image_options,
3655                             &ns_info, context->read_write_paths,
3656                             needs_sandboxing ? context->read_only_paths : NULL,
3657                             needs_sandboxing ? context->inaccessible_paths : NULL,
3658                             needs_sandboxing ? context->exec_paths : NULL,
3659                             needs_sandboxing ? context->no_exec_paths : NULL,
3660                             empty_directories,
3661                             symlinks,
3662                             bind_mounts,
3663                             n_bind_mounts,
3664                             context->temporary_filesystems,
3665                             context->n_temporary_filesystems,
3666                             context->mount_images,
3667                             context->n_mount_images,
3668                             tmp_dir,
3669                             var_tmp_dir,
3670                             creds_path,
3671                             context->log_namespace,
3672                             context->mount_flags,
3673                             context->root_hash, context->root_hash_size, context->root_hash_path,
3674                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3675                             context->root_verity,
3676                             context->extension_images,
3677                             context->n_extension_images,
3678                             context->extension_directories,
3679                             propagate_dir,
3680                             incoming_dir,
3681                             extension_dir,
3682                             root_dir || root_image ? params->notify_socket : NULL,
3683                             error_path);
3684
3685         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3686          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3687          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3688          * completely different execution environment. */
3689         if (r == -ENOANO) {
3690                 if (insist_on_sandboxing(
3691                                     context,
3692                                     root_dir, root_image,
3693                                     bind_mounts,
3694                                     n_bind_mounts)) {
3695                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3696                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3697                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3698
3699                         r = -EOPNOTSUPP;
3700                 } else {
3701                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3702                         r = 0;
3703                 }
3704         }
3705
3706 finalize:
3707         bind_mount_free_many(bind_mounts, n_bind_mounts);
3708         return r;
3709 }
3710
3711 static int apply_working_directory(
3712                 const ExecContext *context,
3713                 const ExecParameters *params,
3714                 const char *home,
3715                 int *exit_status) {
3716
3717         const char *d, *wd;
3718
3719         assert(context);
3720         assert(exit_status);
3721
3722         if (context->working_directory_home) {
3723
3724                 if (!home) {
3725                         *exit_status = EXIT_CHDIR;
3726                         return -ENXIO;
3727                 }
3728
3729                 wd = home;
3730
3731         } else
3732                 wd = empty_to_root(context->working_directory);
3733
3734         if (params->flags & EXEC_APPLY_CHROOT)
3735                 d = wd;
3736         else
3737                 d = prefix_roota(context->root_directory, wd);
3738
3739         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3740                 *exit_status = EXIT_CHDIR;
3741                 return -errno;
3742         }
3743
3744         return 0;
3745 }
3746
3747 static int apply_root_directory(
3748                 const ExecContext *context,
3749                 const ExecParameters *params,
3750                 const bool needs_mount_ns,
3751                 int *exit_status) {
3752
3753         assert(context);
3754         assert(exit_status);
3755
3756         if (params->flags & EXEC_APPLY_CHROOT)
3757                 if (!needs_mount_ns && context->root_directory)
3758                         if (chroot(context->root_directory) < 0) {
3759                                 *exit_status = EXIT_CHROOT;
3760                                 return -errno;
3761                         }
3762
3763         return 0;
3764 }
3765
3766 static int setup_keyring(
3767                 const Unit *u,
3768                 const ExecContext *context,
3769                 const ExecParameters *p,
3770                 uid_t uid, gid_t gid) {
3771
3772         key_serial_t keyring;
3773         int r = 0;
3774         uid_t saved_uid;
3775         gid_t saved_gid;
3776
3777         assert(u);
3778         assert(context);
3779         assert(p);
3780
3781         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3782          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3783          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3784          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3785          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3786          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3787
3788         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3789                 return 0;
3790
3791         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3792          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3793          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3794          * & group is just as nasty as acquiring a reference to the user keyring. */
3795
3796         saved_uid = getuid();
3797         saved_gid = getgid();
3798
3799         if (gid_is_valid(gid) && gid != saved_gid) {
3800                 if (setregid(gid, -1) < 0)
3801                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3802         }
3803
3804         if (uid_is_valid(uid) && uid != saved_uid) {
3805                 if (setreuid(uid, -1) < 0) {
3806                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3807                         goto out;
3808                 }
3809         }
3810
3811         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3812         if (keyring == -1) {
3813                 if (errno == ENOSYS)
3814                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3815                 else if (ERRNO_IS_PRIVILEGE(errno))
3816                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3817                 else if (errno == EDQUOT)
3818                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3819                 else
3820                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3821
3822                 goto out;
3823         }
3824
3825         /* When requested link the user keyring into the session keyring. */
3826         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3827
3828                 if (keyctl(KEYCTL_LINK,
3829                            KEY_SPEC_USER_KEYRING,
3830                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3831                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3832                         goto out;
3833                 }
3834         }
3835
3836         /* Restore uid/gid back */
3837         if (uid_is_valid(uid) && uid != saved_uid) {
3838                 if (setreuid(saved_uid, -1) < 0) {
3839                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3840                         goto out;
3841                 }
3842         }
3843
3844         if (gid_is_valid(gid) && gid != saved_gid) {
3845                 if (setregid(saved_gid, -1) < 0)
3846                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3847         }
3848
3849         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3850         if (!sd_id128_is_null(u->invocation_id)) {
3851                 key_serial_t key;
3852
3853                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3854                 if (key == -1)
3855                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3856                 else {
3857                         if (keyctl(KEYCTL_SETPERM, key,
3858                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3859                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3860                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3861                 }
3862         }
3863
3864 out:
3865         /* Revert back uid & gid for the last time, and exit */
3866         /* no extra logging, as only the first already reported error matters */
3867         if (getuid() != saved_uid)
3868                 (void) setreuid(saved_uid, -1);
3869
3870         if (getgid() != saved_gid)
3871                 (void) setregid(saved_gid, -1);
3872
3873         return r;
3874 }
3875
/* Appends every valid (i.e. non-negative) fd of the given socket pair to 'array', bumping the element
 * counter '*n' once per fd stored. The caller has to make sure 'array' has room for two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue; /* This end of the pair is not open, nothing to record */

                array[*n] = pair[i];
                (*n)++;
        }
}
3886
3887 static int close_remaining_fds(
3888                 const ExecParameters *params,
3889                 const ExecRuntime *runtime,
3890                 const DynamicCreds *dcreds,
3891                 int user_lookup_fd,
3892                 int socket_fd,
3893                 const int *fds, size_t n_fds) {
3894
3895         size_t n_dont_close = 0;
3896         int dont_close[n_fds + 12];
3897
3898         assert(params);
3899
3900         if (params->stdin_fd >= 0)
3901                 dont_close[n_dont_close++] = params->stdin_fd;
3902         if (params->stdout_fd >= 0)
3903                 dont_close[n_dont_close++] = params->stdout_fd;
3904         if (params->stderr_fd >= 0)
3905                 dont_close[n_dont_close++] = params->stderr_fd;
3906
3907         if (socket_fd >= 0)
3908                 dont_close[n_dont_close++] = socket_fd;
3909         if (n_fds > 0) {
3910                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3911                 n_dont_close += n_fds;
3912         }
3913
3914         if (runtime) {
3915                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3916                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3917         }
3918
3919         if (dcreds) {
3920                 if (dcreds->user)
3921                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3922                 if (dcreds->group)
3923                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3924         }
3925
3926         if (user_lookup_fd >= 0)
3927                 dont_close[n_dont_close++] = user_lookup_fd;
3928
3929         return close_all_fds(dont_close, n_dont_close);
3930 }
3931
3932 static int send_user_lookup(
3933                 Unit *unit,
3934                 int user_lookup_fd,
3935                 uid_t uid,
3936                 gid_t gid) {
3937
3938         assert(unit);
3939
3940         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3941          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3942          * specified. */
3943
3944         if (user_lookup_fd < 0)
3945                 return 0;
3946
3947         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3948                 return 0;
3949
3950         if (writev(user_lookup_fd,
3951                (struct iovec[]) {
3952                            IOVEC_INIT(&uid, sizeof(uid)),
3953                            IOVEC_INIT(&gid, sizeof(gid)),
3954                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3955                 return -errno;
3956
3957         return 0;
3958 }
3959
3960 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3961         int r;
3962
3963         assert(c);
3964         assert(home);
3965         assert(buf);
3966
3967         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3968
3969         if (*home)
3970                 return 0;
3971
3972         if (!c->working_directory_home)
3973                 return 0;
3974
3975         r = get_home_dir(buf);
3976         if (r < 0)
3977                 return r;
3978
3979         *home = *buf;
3980         return 1;
3981 }
3982
3983 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3984         _cleanup_strv_free_ char ** list = NULL;
3985         int r;
3986
3987         assert(c);
3988         assert(p);
3989         assert(ret);
3990
3991         assert(c->dynamic_user);
3992
3993         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3994          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3995          * directories. */
3996
3997         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3998                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3999                         continue;
4000
4001                 if (!p->prefix[t])
4002                         continue;
4003
4004                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4005                         char *e;
4006
4007                         if (exec_directory_is_private(c, t))
4008                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4009                         else
4010                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4011                         if (!e)
4012                                 return -ENOMEM;
4013
4014                         r = strv_consume(&list, e);
4015                         if (r < 0)
4016                                 return r;
4017                 }
4018         }
4019
4020         *ret = TAKE_PTR(list);
4021
4022         return 0;
4023 }
4024
4025 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4026         bool using_subcgroup;
4027         char *p;
4028
4029         assert(params);
4030         assert(ret);
4031
4032         if (!params->cgroup_path)
4033                 return -EINVAL;
4034
4035         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4036          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4037          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4038          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4039          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4040          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4041          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4042          * flag, which is only passed for the former statements, not for the latter. */
4043
4044         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4045         if (using_subcgroup)
4046                 p = path_join(params->cgroup_path, ".control");
4047         else
4048                 p = strdup(params->cgroup_path);
4049         if (!p)
4050                 return -ENOMEM;
4051
4052         *ret = p;
4053         return using_subcgroup;
4054 }
4055
4056 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4057         _cleanup_(cpu_set_reset) CPUSet s = {};
4058         int r;
4059
4060         assert(c);
4061         assert(ret);
4062
4063         if (!c->numa_policy.nodes.set) {
4064                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4065                 return 0;
4066         }
4067
4068         r = numa_to_cpu_set(&c->numa_policy, &s);
4069         if (r < 0)
4070                 return r;
4071
4072         cpu_set_reset(ret);
4073
4074         return cpu_set_add_all(ret, &s);
4075 }
4076
4077 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4078         assert(c);
4079
4080         return c->cpu_affinity_from_numa;
4081 }
4082
4083 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4084         int r;
4085
4086         assert(fds);
4087         assert(n_fds);
4088         assert(*n_fds < fds_size);
4089         assert(ret_fd);
4090
4091         if (fd < 0) {
4092                 *ret_fd = -EBADF;
4093                 return 0;
4094         }
4095
4096         if (fd < 3 + (int) *n_fds) {
4097                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4098                  * the fds we pass to the process (or which are closed only during execve). */
4099
4100                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4101                 if (r < 0)
4102                         return -errno;
4103
4104                 close_and_replace(fd, r);
4105         }
4106
4107         *ret_fd = fds[*n_fds] = fd;
4108         (*n_fds) ++;
4109         return 1;
4110 }
4111
4112 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4113         union sockaddr_union addr = {
4114                 .un.sun_family = AF_UNIX,
4115         };
4116         socklen_t sa_len;
4117         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4118         int r;
4119
4120         assert(u);
4121         assert(of);
4122         assert(ofd >= 0);
4123
4124         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4125         if (r < 0)
4126                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4127
4128         sa_len = r;
4129
4130         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4131                 _cleanup_close_ int fd = -EBADF;
4132
4133                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4134                 if (fd < 0)
4135                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4136
4137                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4138                 if (r == -EPROTOTYPE)
4139                         continue;
4140                 if (r < 0)
4141                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4142
4143                 return TAKE_FD(fd);
4144         }
4145
4146         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4147 }
4148
4149 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4150         struct stat st;
4151         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4152
4153         assert(u);
4154         assert(of);
4155
4156         ofd = open(of->path, O_PATH | O_CLOEXEC);
4157         if (ofd < 0)
4158                 return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
4159         if (fstat(ofd, &st) < 0)
4160                 return log_error_errno(errno, "Failed to stat %s: %m", of->path);
4161
4162         if (S_ISSOCK(st.st_mode)) {
4163                 fd = connect_unix_harder(u, of, ofd);
4164                 if (fd < 0)
4165                         return fd;
4166
4167                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4168                         return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
4169
4170                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4171         } else {
4172                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4173                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4174                         flags |= O_APPEND;
4175                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4176                         flags |= O_TRUNC;
4177
4178                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4179                 if (fd < 0)
4180                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4181
4182                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4183         }
4184
4185         return TAKE_FD(fd);
4186 }
4187
4188 static int collect_open_file_fds(
4189                 Unit *u,
4190                 OpenFile* open_files,
4191                 int **fds,
4192                 char ***fdnames,
4193                 size_t *n_fds) {
4194         int r;
4195
4196         assert(u);
4197         assert(fds);
4198         assert(fdnames);
4199         assert(n_fds);
4200
4201         LIST_FOREACH(open_files, of, open_files) {
4202                 _cleanup_close_ int fd = -EBADF;
4203
4204                 fd = get_open_file_fd(u, of);
4205                 if (fd < 0) {
4206                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4207                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4208                                 continue;
4209                         }
4210
4211                         return fd;
4212                 }
4213
4214                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4215                         return -ENOMEM;
4216
4217                 r = strv_extend(fdnames, of->fdname);
4218                 if (r < 0)
4219                         return r;
4220
4221                 (*fds)[*n_fds] = TAKE_FD(fd);
4222
4223                 (*n_fds)++;
4224         }
4225
4226         return 0;
4227 }
4228
4229 static int exec_child(
4230                 Unit *unit,
4231                 const ExecCommand *command,
4232                 const ExecContext *context,
4233                 const ExecParameters *params,
4234                 ExecRuntime *runtime,
4235                 DynamicCreds *dcreds,
4236                 int socket_fd,
4237                 const int named_iofds[static 3],
4238                 int *params_fds,
4239                 size_t n_socket_fds,
4240                 size_t n_storage_fds,
4241                 char **files_env,
4242                 int user_lookup_fd,
4243                 int *exit_status) {
4244
4245         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4246         int r, ngids = 0, exec_fd;
4247         _cleanup_free_ gid_t *supplementary_gids = NULL;
4248         const char *username = NULL, *groupname = NULL;
4249         _cleanup_free_ char *home_buffer = NULL;
4250         const char *home = NULL, *shell = NULL;
4251         char **final_argv = NULL;
4252         dev_t journal_stream_dev = 0;
4253         ino_t journal_stream_ino = 0;
4254         bool userns_set_up = false;
4255         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4256                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4257                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4258                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4259 #if HAVE_SELINUX
4260         _cleanup_free_ char *mac_selinux_context_net = NULL;
4261         bool use_selinux = false;
4262 #endif
4263 #if ENABLE_SMACK
4264         bool use_smack = false;
4265 #endif
4266 #if HAVE_APPARMOR
4267         bool use_apparmor = false;
4268 #endif
4269         uid_t saved_uid = getuid();
4270         gid_t saved_gid = getgid();
4271         uid_t uid = UID_INVALID;
4272         gid_t gid = GID_INVALID;
4273         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4274                n_keep_fds; /* total number of fds not to close */
4275         int secure_bits;
4276         _cleanup_free_ gid_t *gids_after_pam = NULL;
4277         int ngids_after_pam = 0;
4278         _cleanup_free_ int *fds = NULL;
4279         _cleanup_strv_free_ char **fdnames = NULL;
4280
4281         assert(unit);
4282         assert(command);
4283         assert(context);
4284         assert(params);
4285         assert(exit_status);
4286
4287         /* Explicitly test for CVE-2021-4034 inspired invocations */
4288         assert(command->path);
4289         assert(!strv_isempty(command->argv));
4290
4291         rename_process_from_path(command->path);
4292
4293         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4294          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4295          * both of which will be demoted to SIG_DFL. */
4296         (void) default_signals(SIGNALS_CRASH_HANDLER,
4297                                SIGNALS_IGNORE);
4298
4299         if (context->ignore_sigpipe)
4300                 (void) ignore_signals(SIGPIPE);
4301
4302         r = reset_signal_mask();
4303         if (r < 0) {
4304                 *exit_status = EXIT_SIGNAL_MASK;
4305                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4306         }
4307
4308         if (params->idle_pipe)
4309                 do_idle_pipe_dance(params->idle_pipe);
4310
4311         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4312          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4313          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4314          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4315
4316         log_forget_fds();
4317         log_set_open_when_needed(true);
4318
4319         /* In case anything used libc syslog(), close this here, too */
4320         closelog();
4321
4322         fds = newdup(int, params_fds, n_fds);
4323         if (!fds) {
4324                 *exit_status = EXIT_MEMORY;
4325                 return log_oom();
4326         }
4327
4328         fdnames = strv_copy((char**) params->fd_names);
4329         if (!fdnames) {
4330                 *exit_status = EXIT_MEMORY;
4331                 return log_oom();
4332         }
4333
4334         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4335         if (r < 0) {
4336                 *exit_status = EXIT_FDS;
4337                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4338         }
4339
4340         int keep_fds[n_fds + 3];
4341         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4342         n_keep_fds = n_fds;
4343
4344         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4345         if (r < 0) {
4346                 *exit_status = EXIT_FDS;
4347                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4348         }
4349
4350 #if HAVE_LIBBPF
4351         if (unit->manager->restrict_fs) {
4352                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4353                 if (bpf_map_fd < 0) {
4354                         *exit_status = EXIT_FDS;
4355                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4356                 }
4357
4358                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4359                 if (r < 0) {
4360                         *exit_status = EXIT_FDS;
4361                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4362                 }
4363         }
4364 #endif
4365
4366         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4367         if (r < 0) {
4368                 *exit_status = EXIT_FDS;
4369                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4370         }
4371
4372         if (!context->same_pgrp &&
4373             setsid() < 0) {
4374                 *exit_status = EXIT_SETSID;
4375                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4376         }
4377
4378         exec_context_tty_reset(context, params);
4379
4380         if (unit_shall_confirm_spawn(unit)) {
4381                 _cleanup_free_ char *cmdline = NULL;
4382
4383                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4384                 if (!cmdline) {
4385                         *exit_status = EXIT_MEMORY;
4386                         return log_oom();
4387                 }
4388
4389                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4390                 if (r != CONFIRM_EXECUTE) {
4391                         if (r == CONFIRM_PRETEND_SUCCESS) {
4392                                 *exit_status = EXIT_SUCCESS;
4393                                 return 0;
4394                         }
4395                         *exit_status = EXIT_CONFIRM;
4396                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4397                                                     "Execution cancelled by the user");
4398                 }
4399         }
4400
4401         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4402          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4403          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4404          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4405          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4406         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4407             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4408                 *exit_status = EXIT_MEMORY;
4409                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4410         }
4411
4412         if (context->dynamic_user && dcreds) {
4413                 _cleanup_strv_free_ char **suggested_paths = NULL;
4414
4415                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4416                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4417                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4418                         *exit_status = EXIT_USER;
4419                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4420                 }
4421
4422                 r = compile_suggested_paths(context, params, &suggested_paths);
4423                 if (r < 0) {
4424                         *exit_status = EXIT_MEMORY;
4425                         return log_oom();
4426                 }
4427
4428                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4429                 if (r < 0) {
4430                         *exit_status = EXIT_USER;
4431                         if (r == -EILSEQ)
4432                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4433                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4434                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4435                 }
4436
4437                 if (!uid_is_valid(uid)) {
4438                         *exit_status = EXIT_USER;
4439                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4440                 }
4441
4442                 if (!gid_is_valid(gid)) {
4443                         *exit_status = EXIT_USER;
4444                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4445                 }
4446
4447                 if (dcreds->user)
4448                         username = dcreds->user->name;
4449
4450         } else {
4451                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4452                 if (r < 0) {
4453                         *exit_status = EXIT_USER;
4454                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4455                 }
4456
4457                 r = get_fixed_group(context, &groupname, &gid);
4458                 if (r < 0) {
4459                         *exit_status = EXIT_GROUP;
4460                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4461                 }
4462         }
4463
4464         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4465         r = get_supplementary_groups(context, username, groupname, gid,
4466                                      &supplementary_gids, &ngids);
4467         if (r < 0) {
4468                 *exit_status = EXIT_GROUP;
4469                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4470         }
4471
4472         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4473         if (r < 0) {
4474                 *exit_status = EXIT_USER;
4475                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4476         }
4477
4478         user_lookup_fd = safe_close(user_lookup_fd);
4479
4480         r = acquire_home(context, uid, &home, &home_buffer);
4481         if (r < 0) {
4482                 *exit_status = EXIT_CHDIR;
4483                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4484         }
4485
4486         /* If a socket is connected to STDIN/STDOUT/STDERR, we
4487          * must sure to drop O_NONBLOCK */
4488         if (socket_fd >= 0)
4489                 (void) fd_nonblock(socket_fd, false);
4490
4491         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4492          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4493         if (params->cgroup_path) {
4494                 _cleanup_free_ char *p = NULL;
4495
4496                 r = exec_parameters_get_cgroup_path(params, &p);
4497                 if (r < 0) {
4498                         *exit_status = EXIT_CGROUP;
4499                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4500                 }
4501
4502                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4503                 if (r == -EUCLEAN) {
4504                         *exit_status = EXIT_CGROUP;
4505                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4506                                                     "because the cgroup or one of its parents or "
4507                                                     "siblings is in the threaded mode: %m", p);
4508                 }
4509                 if (r < 0) {
4510                         *exit_status = EXIT_CGROUP;
4511                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4512                 }
4513         }
4514
4515         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4516                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4517                 if (r < 0) {
4518                         *exit_status = EXIT_NETWORK;
4519                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4520                 }
4521         }
4522
4523         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4524                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4525                 if (r < 0) {
4526                         *exit_status = EXIT_NAMESPACE;
4527                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4528                 }
4529         }
4530
4531         r = setup_input(context, params, socket_fd, named_iofds);
4532         if (r < 0) {
4533                 *exit_status = EXIT_STDIN;
4534                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4535         }
4536
4537         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4538         if (r < 0) {
4539                 *exit_status = EXIT_STDOUT;
4540                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4541         }
4542
4543         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4544         if (r < 0) {
4545                 *exit_status = EXIT_STDERR;
4546                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4547         }
4548
4549         if (context->oom_score_adjust_set) {
4550                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4551                  * prohibit write access to this file, and we shouldn't trip up over that. */
4552                 r = set_oom_score_adjust(context->oom_score_adjust);
4553                 if (ERRNO_IS_PRIVILEGE(r))
4554                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4555                 else if (r < 0) {
4556                         *exit_status = EXIT_OOM_ADJUST;
4557                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4558                 }
4559         }
4560
4561         if (context->coredump_filter_set) {
4562                 r = set_coredump_filter(context->coredump_filter);
4563                 if (ERRNO_IS_PRIVILEGE(r))
4564                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4565                 else if (r < 0)
4566                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4567         }
4568
4569         if (context->nice_set) {
4570                 r = setpriority_closest(context->nice);
4571                 if (r < 0)
4572                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4573         }
4574
4575         if (context->cpu_sched_set) {
4576                 struct sched_param param = {
4577                         .sched_priority = context->cpu_sched_priority,
4578                 };
4579
4580                 r = sched_setscheduler(0,
4581                                        context->cpu_sched_policy |
4582                                        (context->cpu_sched_reset_on_fork ?
4583                                         SCHED_RESET_ON_FORK : 0),
4584                                        &param);
4585                 if (r < 0) {
4586                         *exit_status = EXIT_SETSCHEDULER;
4587                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4588                 }
4589         }
4590
4591         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4592                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4593                 const CPUSet *cpu_set;
4594
4595                 if (context->cpu_affinity_from_numa) {
4596                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4597                         if (r < 0) {
4598                                 *exit_status = EXIT_CPUAFFINITY;
4599                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4600                         }
4601
4602                         cpu_set = &converted_cpu_set;
4603                 } else
4604                         cpu_set = &context->cpu_set;
4605
4606                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4607                         *exit_status = EXIT_CPUAFFINITY;
4608                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4609                 }
4610         }
4611
4612         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4613                 r = apply_numa_policy(&context->numa_policy);
4614                 if (r == -EOPNOTSUPP)
4615                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4616                 else if (r < 0) {
4617                         *exit_status = EXIT_NUMA_POLICY;
4618                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4619                 }
4620         }
4621
4622         if (context->ioprio_set)
4623                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4624                         *exit_status = EXIT_IOPRIO;
4625                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4626                 }
4627
4628         if (context->timer_slack_nsec != NSEC_INFINITY)
4629                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4630                         *exit_status = EXIT_TIMERSLACK;
4631                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4632                 }
4633
4634         if (context->personality != PERSONALITY_INVALID) {
4635                 r = safe_personality(context->personality);
4636                 if (r < 0) {
4637                         *exit_status = EXIT_PERSONALITY;
4638                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4639                 }
4640         }
4641
4642         if (context->utmp_id) {
4643                 const char *line = context->tty_path ?
4644                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4645                         NULL;
4646                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4647                                       line,
4648                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4649                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4650                                       USER_PROCESS,
4651                                       username);
4652         }
4653
4654         if (uid_is_valid(uid)) {
4655                 r = chown_terminal(STDIN_FILENO, uid);
4656                 if (r < 0) {
4657                         *exit_status = EXIT_STDIN;
4658                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4659                 }
4660         }
4661
4662         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4663          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4664          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4665          * touch a single hierarchy too. */
4666         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4667                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4668                 if (r < 0) {
4669                         *exit_status = EXIT_CGROUP;
4670                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4671                 }
4672         }
4673
4674         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4675
4676         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4677                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4678                 if (r < 0)
4679                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4680         }
4681
4682         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4683                 r = setup_credentials(context, params, unit->id, uid);
4684                 if (r < 0) {
4685                         *exit_status = EXIT_CREDENTIALS;
4686                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4687                 }
4688         }
4689
4690         r = build_environment(
4691                         unit,
4692                         context,
4693                         params,
4694                         n_fds,
4695                         fdnames,
4696                         home,
4697                         username,
4698                         shell,
4699                         journal_stream_dev,
4700                         journal_stream_ino,
4701                         &our_env);
4702         if (r < 0) {
4703                 *exit_status = EXIT_MEMORY;
4704                 return log_oom();
4705         }
4706
4707         r = build_pass_environment(context, &pass_env);
4708         if (r < 0) {
4709                 *exit_status = EXIT_MEMORY;
4710                 return log_oom();
4711         }
4712
4713         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4714          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4715          * not specify PATH but the unit has ExecSearchPath. */
4716         if (!strv_isempty(context->exec_search_path)) {
4717                 _cleanup_free_ char *joined = NULL;
4718
4719                 joined = strv_join(context->exec_search_path, ":");
4720                 if (!joined) {
4721                         *exit_status = EXIT_MEMORY;
4722                         return log_oom();
4723                 }
4724
4725                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4726                 if (r < 0) {
4727                         *exit_status = EXIT_MEMORY;
4728                         return log_oom();
4729                 }
4730         }
4731
4732         accum_env = strv_env_merge(params->environment,
4733                                    our_env,
4734                                    joined_exec_search_path,
4735                                    pass_env,
4736                                    context->environment,
4737                                    files_env);
4738         if (!accum_env) {
4739                 *exit_status = EXIT_MEMORY;
4740                 return log_oom();
4741         }
4742         accum_env = strv_env_clean(accum_env);
4743
4744         (void) umask(context->umask);
4745
4746         r = setup_keyring(unit, context, params, uid, gid);
4747         if (r < 0) {
4748                 *exit_status = EXIT_KEYRING;
4749                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4750         }
4751
4752         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4753          * from it. */
4754         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4755
4756         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4757          * for it, and the kernel doesn't actually support ambient caps. */
4758         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4759
4760         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4761          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4762          * desired. */
4763         if (needs_ambient_hack)
4764                 needs_setuid = false;
4765         else
4766                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4767
4768         if (needs_sandboxing) {
4769                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4770                  * /sys being present. The actual MAC context application will happen later, as late as
4771                  * possible, to avoid impacting our own code paths. */
4772
4773 #if HAVE_SELINUX
4774                 use_selinux = mac_selinux_use();
4775 #endif
4776 #if ENABLE_SMACK
4777                 use_smack = mac_smack_use();
4778 #endif
4779 #if HAVE_APPARMOR
4780                 use_apparmor = mac_apparmor_use();
4781 #endif
4782         }
4783
4784         if (needs_sandboxing) {
4785                 int which_failed;
4786
4787                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4788                  * is set here. (See below.) */
4789
4790                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4791                 if (r < 0) {
4792                         *exit_status = EXIT_LIMITS;
4793                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4794                 }
4795         }
4796
4797         if (needs_setuid && context->pam_name && username) {
4798                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4799                  * wins here. (See above.) */
4800
4801                 /* All fds passed in the fds array will be closed in the pam child process. */
4802                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4803                 if (r < 0) {
4804                         *exit_status = EXIT_PAM;
4805                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4806                 }
4807
4808                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4809                 if (ngids_after_pam < 0) {
4810                         *exit_status = EXIT_MEMORY;
4811                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4812                 }
4813         }
4814
4815         if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
4816                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4817                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4818                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4819
4820                 userns_set_up = true;
4821                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4822                 if (r < 0) {
4823                         *exit_status = EXIT_USER;
4824                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4825                 }
4826         }
4827
4828         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4829
4830                 if (ns_type_supported(NAMESPACE_NET)) {
4831                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4832                         if (r == -EPERM)
4833                                 log_unit_warning_errno(unit, r,
4834                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4835                         else if (r < 0) {
4836                                 *exit_status = EXIT_NETWORK;
4837                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4838                         }
4839                 } else if (context->network_namespace_path) {
4840                         *exit_status = EXIT_NETWORK;
4841                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4842                                                     "NetworkNamespacePath= is not supported, refusing.");
4843                 } else
4844                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4845         }
4846
4847         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4848
4849                 if (ns_type_supported(NAMESPACE_IPC)) {
4850                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4851                         if (r == -EPERM)
4852                                 log_unit_warning_errno(unit, r,
4853                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4854                         else if (r < 0) {
4855                                 *exit_status = EXIT_NAMESPACE;
4856                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4857                         }
4858                 } else if (context->ipc_namespace_path) {
4859                         *exit_status = EXIT_NAMESPACE;
4860                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4861                                                     "IPCNamespacePath= is not supported, refusing.");
4862                 } else
4863                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4864         }
4865
4866         if (needs_mount_namespace) {
4867                 _cleanup_free_ char *error_path = NULL;
4868
4869                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4870                 if (r < 0) {
4871                         *exit_status = EXIT_NAMESPACE;
4872                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4873                                                     error_path ? ": " : "", strempty(error_path));
4874                 }
4875         }
4876
4877         if (needs_sandboxing) {
4878                 r = apply_protect_hostname(unit, context, exit_status);
4879                 if (r < 0)
4880                         return r;
4881         }
4882
4883         /* Drop groups as early as possible.
4884          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4885          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4886         if (needs_setuid) {
4887                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4888                 int ngids_to_enforce = 0;
4889
4890                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4891                                                    ngids,
4892                                                    gids_after_pam,
4893                                                    ngids_after_pam,
4894                                                    &gids_to_enforce);
4895                 if (ngids_to_enforce < 0) {
4896                         *exit_status = EXIT_MEMORY;
4897                         return log_unit_error_errno(unit,
4898                                                     ngids_to_enforce,
4899                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4900                 }
4901
4902                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4903                 if (r < 0) {
4904                         *exit_status = EXIT_GROUP;
4905                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4906                 }
4907         }
4908
4909         /* If the user namespace was not set up above, try to do it now.
4910          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4911          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4912          * case of mount namespaces being less privileged when the mount point list is copied from a
4913          * different user namespace). */
4914
4915         if (needs_sandboxing && context->private_users && !userns_set_up) {
4916                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4917                 if (r < 0) {
4918                         *exit_status = EXIT_USER;
4919                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4920                 }
4921         }
4922
4923         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4924          * shall execute. */
4925
4926         _cleanup_free_ char *executable = NULL;
4927         _cleanup_close_ int executable_fd = -EBADF;
4928         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4929         if (r < 0) {
4930                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4931                         log_unit_struct_errno(unit, LOG_INFO, r,
4932                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4933                                               LOG_UNIT_INVOCATION_ID(unit),
4934                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4935                                                                command->path),
4936                                               "EXECUTABLE=%s", command->path);
4937                         return 0;
4938                 }
4939
4940                 *exit_status = EXIT_EXEC;
4941
4942                 return log_unit_struct_errno(unit, LOG_INFO, r,
4943                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4944                                              LOG_UNIT_INVOCATION_ID(unit),
4945                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4946                                                               command->path),
4947                                              "EXECUTABLE=%s", command->path);
4948         }
4949
4950         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4951         if (r < 0) {
4952                 *exit_status = EXIT_FDS;
4953                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4954         }
4955
4956 #if HAVE_SELINUX
4957         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4958                 int fd = -EBADF;
4959
4960                 if (socket_fd >= 0)
4961                         fd = socket_fd;
4962                 else if (params->n_socket_fds == 1)
4963                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4964                          * use context from that fd to compute the label. */
4965                         fd = params->fds[0];
4966
4967                 if (fd >= 0) {
4968                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4969                         if (r < 0) {
4970                                 if (!context->selinux_context_ignore) {
4971                                         *exit_status = EXIT_SELINUX_CONTEXT;
4972                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4973                                 }
4974                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4975                         }
4976                 }
4977         }
4978 #endif
4979
4980         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4981          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4982          * however if we have it as we want to keep it open until the final execve(). */
4983
4984         r = close_all_fds(keep_fds, n_keep_fds);
4985         if (r >= 0)
4986                 r = shift_fds(fds, n_fds);
4987         if (r >= 0)
4988                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4989         if (r < 0) {
4990                 *exit_status = EXIT_FDS;
4991                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4992         }
4993
4994         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4995          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4996          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4997          * came this far. */
4998
4999         secure_bits = context->secure_bits;
5000
5001         if (needs_sandboxing) {
5002                 uint64_t bset;
5003
5004                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5005                  * requested. (Note this is placed after the general resource limit initialization, see
5006                  * above, in order to take precedence.) */
5007                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5008                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5009                                 *exit_status = EXIT_LIMITS;
5010                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5011                         }
5012                 }
5013
5014 #if ENABLE_SMACK
5015                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5016                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5017                 if (use_smack) {
5018                         r = setup_smack(unit->manager, context, executable_fd);
5019                         if (r < 0 && !context->smack_process_label_ignore) {
5020                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5021                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5022                         }
5023                 }
5024 #endif
5025
5026                 bset = context->capability_bounding_set;
5027                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5028                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5029                  * instead of us doing that */
5030                 if (needs_ambient_hack)
5031                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5032                                 (UINT64_C(1) << CAP_SETUID) |
5033                                 (UINT64_C(1) << CAP_SETGID);
5034
5035                 if (!cap_test_all(bset)) {
5036                         r = capability_bounding_set_drop(bset, false);
5037                         if (r < 0) {
5038                                 *exit_status = EXIT_CAPABILITIES;
5039                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5040                         }
5041                 }
5042
5043                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5044                  * keep-caps set.
5045                  * To be able to raise the ambient capabilities after setresuid() they have to be
5046                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
5047                  * After setresuid() the ambient capabilities can be raised as they are present in
5048                  * the permitted and inheritable set. However it is possible that someone wants to
5049                  * set ambient capabilities without changing the user, so we also set the ambient
5050                  * capabilities here.
5051                  * The requested ambient capabilities are raised in the inheritable set if the
5052                  * second argument is true. */
5053                 if (!needs_ambient_hack) {
5054                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
5055                         if (r < 0) {
5056                                 *exit_status = EXIT_CAPABILITIES;
5057                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5058                         }
5059                 }
5060         }
5061
5062         /* chroot to root directory first, before we lose the ability to chroot */
5063         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5064         if (r < 0)
5065                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5066
5067         if (needs_setuid) {
5068                 if (uid_is_valid(uid)) {
5069                         r = enforce_user(context, uid);
5070                         if (r < 0) {
5071                                 *exit_status = EXIT_USER;
5072                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5073                         }
5074
5075                         if (!needs_ambient_hack &&
5076                             context->capability_ambient_set != 0) {
5077
5078                                 /* Raise the ambient capabilities after user change. */
5079                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
5080                                 if (r < 0) {
5081                                         *exit_status = EXIT_CAPABILITIES;
5082                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5083                                 }
5084                         }
5085                 }
5086         }
5087
5088         /* Apply working directory here, because the working directory might be on NFS and only the user running
5089          * this service might have the correct privilege to change to the working directory */
5090         r = apply_working_directory(context, params, home, exit_status);
5091         if (r < 0)
5092                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5093
5094         if (needs_sandboxing) {
5095                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5096                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5097                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5098                  * are restricted. */
5099
5100 #if HAVE_SELINUX
5101                 if (use_selinux) {
5102                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5103
5104                         if (exec_context) {
5105                                 r = setexeccon(exec_context);
5106                                 if (r < 0) {
5107                                         if (!context->selinux_context_ignore) {
5108                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5109                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5110                                         }
5111                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5112                                 }
5113                         }
5114                 }
5115 #endif
5116
5117 #if HAVE_APPARMOR
5118                 if (use_apparmor && context->apparmor_profile) {
5119                         r = aa_change_onexec(context->apparmor_profile);
5120                         if (r < 0 && !context->apparmor_profile_ignore) {
5121                                 *exit_status = EXIT_APPARMOR_PROFILE;
5122                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5123                         }
5124                 }
5125 #endif
5126
5127                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
5128                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
5129                  * CAP_SETPCAP. */
5130                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5131                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5132                          * effective set here.
5133                          * The effective set is overwritten during execve with the following values:
5134                          * - ambient set (for non-root processes)
5135                          * - (inheritable | bounding) set (for root processes)
5136                          *
5137                          * Hence there is no security impact to raise it in the effective set before execve
5138                          */
5139                         r = capability_gain_cap_setpcap(NULL);
5140                         if (r < 0) {
5141                                 *exit_status = EXIT_CAPABILITIES;
5142                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5143                         }
5144                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5145                                 *exit_status = EXIT_SECUREBITS;
5146                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5147                         }
5148                 }
5149
5150                 if (context_has_no_new_privileges(context))
5151                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5152                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5153                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5154                         }
5155
5156 #if HAVE_SECCOMP
5157                 r = apply_address_families(unit, context);
5158                 if (r < 0) {
5159                         *exit_status = EXIT_ADDRESS_FAMILIES;
5160                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5161                 }
5162
5163                 r = apply_memory_deny_write_execute(unit, context);
5164                 if (r < 0) {
5165                         *exit_status = EXIT_SECCOMP;
5166                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5167                 }
5168
5169                 r = apply_restrict_realtime(unit, context);
5170                 if (r < 0) {
5171                         *exit_status = EXIT_SECCOMP;
5172                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5173                 }
5174
5175                 r = apply_restrict_suid_sgid(unit, context);
5176                 if (r < 0) {
5177                         *exit_status = EXIT_SECCOMP;
5178                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5179                 }
5180
5181                 r = apply_restrict_namespaces(unit, context);
5182                 if (r < 0) {
5183                         *exit_status = EXIT_SECCOMP;
5184                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5185                 }
5186
5187                 r = apply_protect_sysctl(unit, context);
5188                 if (r < 0) {
5189                         *exit_status = EXIT_SECCOMP;
5190                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5191                 }
5192
5193                 r = apply_protect_kernel_modules(unit, context);
5194                 if (r < 0) {
5195                         *exit_status = EXIT_SECCOMP;
5196                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5197                 }
5198
5199                 r = apply_protect_kernel_logs(unit, context);
5200                 if (r < 0) {
5201                         *exit_status = EXIT_SECCOMP;
5202                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5203                 }
5204
5205                 r = apply_protect_clock(unit, context);
5206                 if (r < 0) {
5207                         *exit_status = EXIT_SECCOMP;
5208                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5209                 }
5210
5211                 r = apply_private_devices(unit, context);
5212                 if (r < 0) {
5213                         *exit_status = EXIT_SECCOMP;
5214                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5215                 }
5216
5217                 r = apply_syscall_archs(unit, context);
5218                 if (r < 0) {
5219                         *exit_status = EXIT_SECCOMP;
5220                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5221                 }
5222
5223                 r = apply_lock_personality(unit, context);
5224                 if (r < 0) {
5225                         *exit_status = EXIT_SECCOMP;
5226                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5227                 }
5228
5229                 r = apply_syscall_log(unit, context);
5230                 if (r < 0) {
5231                         *exit_status = EXIT_SECCOMP;
5232                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5233                 }
5234
5235                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5236                  * by the filter as little as possible. */
5237                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5238                 if (r < 0) {
5239                         *exit_status = EXIT_SECCOMP;
5240                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5241                 }
5242 #endif
5243
5244 #if HAVE_LIBBPF
5245                 r = apply_restrict_filesystems(unit, context);
5246                 if (r < 0) {
5247                         *exit_status = EXIT_BPF;
5248                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5249                 }
5250 #endif
5251
5252         }
5253
5254         if (!strv_isempty(context->unset_environment)) {
5255                 char **ee = NULL;
5256
5257                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5258                 if (!ee) {
5259                         *exit_status = EXIT_MEMORY;
5260                         return log_oom();
5261                 }
5262
5263                 strv_free_and_replace(accum_env, ee);
5264         }
5265
5266         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5267                 replaced_argv = replace_env_argv(command->argv, accum_env);
5268                 if (!replaced_argv) {
5269                         *exit_status = EXIT_MEMORY;
5270                         return log_oom();
5271                 }
5272                 final_argv = replaced_argv;
5273         } else
5274                 final_argv = command->argv;
5275
5276         if (DEBUG_LOGGING) {
5277                 _cleanup_free_ char *line = NULL;
5278
5279                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5280                 if (!line) {
5281                         *exit_status = EXIT_MEMORY;
5282                         return log_oom();
5283                 }
5284
5285                 log_unit_struct(unit, LOG_DEBUG,
5286                                 "EXECUTABLE=%s", executable,
5287                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5288         }
5289
5290         if (exec_fd >= 0) {
5291                 uint8_t hot = 1;
5292
5293                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5294                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5295
5296                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5297                         *exit_status = EXIT_EXEC;
5298                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5299                 }
5300         }
5301
5302         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5303
5304         if (exec_fd >= 0) {
5305                 uint8_t hot = 0;
5306
5307                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5308                  * that POLLHUP on it no longer means execve() succeeded. */
5309
5310                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5311                         *exit_status = EXIT_EXEC;
5312                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5313                 }
5314         }
5315
5316         *exit_status = EXIT_EXEC;
5317         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5318 }
5319
/* Forward declarations: both helpers are defined later in this file but are already called by exec_spawn() below. */
5320 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5321 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5322
5323 int exec_spawn(Unit *unit,
5324                ExecCommand *command,
5325                const ExecContext *context,
5326                const ExecParameters *params,
5327                ExecRuntime *runtime,
5328                DynamicCreds *dcreds,
5329                pid_t *ret) {
5330
             /* Forks off a child process that runs exec_child() for 'command' and, in the parent, stores the
              * child's PID in *ret and returns 0. All preparation visible here (choice of fds to pass, lookup
              * of named stdio fds, loading of environment files, optional creation of a sub-cgroup) is done in
              * the parent before fork(). If any preparation step or fork() itself fails, the result of the
              * corresponding log_unit_error_errno()/log_oom() call is returned and *ret is left untouched. */

5331         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5332         _cleanup_free_ char *subcgroup_path = NULL;
5333         _cleanup_strv_free_ char **files_env = NULL;
5334         size_t n_storage_fds = 0, n_socket_fds = 0;
5335         _cleanup_free_ char *line = NULL;
5336         pid_t pid;
5337
5338         assert(unit);
5339         assert(command);
5340         assert(context);
5341         assert(ret);
5342         assert(params);
             /* A NULL fd array is only acceptable if no fds are announced at all. */
5343         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5344
             /* If any standard stream is to be connected to a socket, exactly one socket fd must have been
              * passed in. It becomes socket_fd, and in that case no fd array is forwarded to the child (fds
              * stays NULL and both counters stay 0). Otherwise the complete fd array is forwarded. */
5345         if (context->std_input == EXEC_INPUT_SOCKET ||
5346             context->std_output == EXEC_OUTPUT_SOCKET ||
5347             context->std_error == EXEC_OUTPUT_SOCKET) {
5348
5349                 if (params->n_socket_fds > 1)
5350                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5351
5352                 if (params->n_socket_fds == 0)
5353                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5354
5355                 socket_fd = params->fds[0];
5356         } else {
5357                 socket_fd = -EBADF;
5358                 fds = params->fds;
5359                 n_socket_fds = params->n_socket_fds;
5360                 n_storage_fds = params->n_storage_fds;
5361         }
5362
             /* Map streams configured as "named fd" to the matching entries of params->fds. */
5363         r = exec_context_named_iofds(context, params, named_iofds);
5364         if (r < 0)
5365                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5366
5367         r = exec_context_load_environment(unit, context, &files_env);
5368         if (r < 0)
5369                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5370
             /* Render the unexpanded argv as one quoted string; it is only used for the log message below. */
5371         line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5372         if (!line)
5373                 return log_oom();
5374
5375         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5376            and, until the next SELinux policy changes, we save further reloads in future children. */
5377         mac_selinux_maybe_reload();
5378
5379         log_unit_struct(unit, LOG_DEBUG,
5380                         LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5381                         "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5382                                                            the mount namespace in the child, but we want to log
5383                                                            from the parent, so we need to use the (possibly
5384                                                            inaccurate) path here. */
5385                         LOG_UNIT_INVOCATION_ID(unit));
5386
             /* subcgroup_path is filled in whenever a cgroup path is configured, but the cgroup is only created
              * here when exec_parameters_get_cgroup_path() reports (r > 0) that a child cgroup is in use. */
5387         if (params->cgroup_path) {
5388                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5389                 if (r < 0)
5390                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5391                 if (r > 0) { /* We are using a child cgroup */
5392                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5393                         if (r < 0)
5394                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5395
5396                         /* Normally we would not propagate the xattrs to children but since we created this
5397                          * sub-cgroup internally we should do it. */
5398                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
5399                         cgroup_log_xattr_apply(unit, subcgroup_path);
5400                 }
5401         }
5402
5403         pid = fork();
5404         if (pid < 0)
5405                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5406
5407         if (pid == 0) {
                     /* Child. If exec_child() returns at all, we never fall through into the parent's code
                      * below: a negative result is logged, and the process then terminates via _exit() with
                      * the exit status exec_child() stored in exit_status. */
5408                 int exit_status = EXIT_SUCCESS;
5409
5410                 r = exec_child(unit,
5411                                command,
5412                                context,
5413                                params,
5414                                runtime,
5415                                dcreds,
5416                                socket_fd,
5417                                named_iofds,
5418                                fds,
5419                                n_socket_fds,
5420                                n_storage_fds,
5421                                files_env,
5422                                unit->manager->user_lookup_fds[1],
5423                                &exit_status);
5424
5425                 if (r < 0) {
5426                         const char *status =
5427                                 exit_status_to_string(exit_status,
5428                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5429
5430                         log_unit_struct_errno(unit, LOG_ERR, r,
5431                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5432                                               LOG_UNIT_INVOCATION_ID(unit),
5433                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5434                                                                status, command->path),
5435                                               "EXECUTABLE=%s", command->path);
5436                 }
5437
5438                 _exit(exit_status);
5439         }
5440
             /* Parent from here on. */
5441         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5442
5443         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5444          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5445          * process will be killed too). */
5446         if (subcgroup_path)
5447                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5448
5449         exec_status_start(&command->exec_status, pid);
5450
5451         *ret = pid;
5452         return 0;
5453 }
5454
5455 void exec_context_init(ExecContext *c) {
             /* Sets the defaults of an ExecContext that are not simply zero. Only the fields assigned below are
              * touched. NOTE(review): presumably the caller hands in a zero-initialized structure — confirm. */
5456         assert(c);
5457
5458         c->umask = 0022;
5459         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5460         c->cpu_sched_policy = SCHED_OTHER;
5461         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5462         c->syslog_level_prefix = true;
5463         c->ignore_sigpipe = true;
5464         c->timer_slack_nsec = NSEC_INFINITY;
5465         c->personality = PERSONALITY_INVALID;
             /* Every directory type (runtime, state, ... ) starts out with mode 0755. */
5466         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5467                 c->directories[t].mode = 0755;
5468         c->timeout_clean_usec = USEC_INFINITY;
5469         c->capability_bounding_set = CAP_ALL;
             /* The initial value must be distinguishable from "all namespaces". */
5470         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5471         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5472         c->log_level_max = -1;
5473 #if HAVE_SECCOMP
5474         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5475 #endif
5476         c->tty_rows = UINT_MAX;
5477         c->tty_cols = UINT_MAX;
5478         numa_policy_reset(&c->numa_policy);
5479 }
5480
5481 void exec_context_done(ExecContext *c) {
             /* Releases the resources referenced by the context's fields. Each pointer is overwritten with the
              * return value of its free helper and the associated counters/sizes are reset, so that no stale
              * references remain in the structure afterwards. The ExecContext object itself is not freed. */
5482         assert(c);
5483
             /* Environment settings */
5484         c->environment = strv_free(c->environment);
5485         c->environment_files = strv_free(c->environment_files);
5486         c->pass_environment = strv_free(c->pass_environment);
5487         c->unset_environment = strv_free(c->unset_environment);
5488
5489         rlimit_free_all(c->rlimit);
5490
             /* One slot each for stdin, stdout and stderr */
5491         for (size_t l = 0; l < 3; l++) {
5492                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5493                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5494         }
5495
             /* Working/root directory, root image and its verity data, extensions, identity */
5496         c->working_directory = mfree(c->working_directory);
5497         c->root_directory = mfree(c->root_directory);
5498         c->root_image = mfree(c->root_image);
5499         c->root_image_options = mount_options_free_all(c->root_image_options);
5500         c->root_hash = mfree(c->root_hash);
5501         c->root_hash_size = 0;
5502         c->root_hash_path = mfree(c->root_hash_path);
5503         c->root_hash_sig = mfree(c->root_hash_sig);
5504         c->root_hash_sig_size = 0;
5505         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5506         c->root_verity = mfree(c->root_verity);
5507         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5508         c->extension_directories = strv_free(c->extension_directories);
5509         c->tty_path = mfree(c->tty_path);
5510         c->syslog_identifier = mfree(c->syslog_identifier);
5511         c->user = mfree(c->user);
5512         c->group = mfree(c->group);
5513
5514         c->supplementary_groups = strv_free(c->supplementary_groups);
5515
5516         c->pam_name = mfree(c->pam_name);
5517
             /* Path lists */
5518         c->read_only_paths = strv_free(c->read_only_paths);
5519         c->read_write_paths = strv_free(c->read_write_paths);
5520         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5521         c->exec_paths = strv_free(c->exec_paths);
5522         c->no_exec_paths = strv_free(c->no_exec_paths);
5523         c->exec_search_path = strv_free(c->exec_search_path);
5524
             /* These two helpers do not return a value to assign, hence pointer and counter are reset by hand. */
5525         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5526         c->bind_mounts = NULL;
5527         c->n_bind_mounts = 0;
5528         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5529         c->temporary_filesystems = NULL;
5530         c->n_temporary_filesystems = 0;
5531         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5532
5533         cpu_set_reset(&c->cpu_set);
5534         numa_policy_reset(&c->numa_policy);
5535
             /* utmp ID and MAC labels/profiles */
5536         c->utmp_id = mfree(c->utmp_id);
5537         c->selinux_context = mfree(c->selinux_context);
5538         c->apparmor_profile = mfree(c->apparmor_profile);
5539         c->smack_process_label = mfree(c->smack_process_label);
5540
5541         c->restrict_filesystems = set_free(c->restrict_filesystems);
5542
5543         c->syscall_filter = hashmap_free(c->syscall_filter);
5544         c->syscall_archs = set_free(c->syscall_archs);
5545         c->address_families = set_free(c->address_families);
5546
5547         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5548                 exec_directory_done(&c->directories[t]);
5549
             /* Same value exec_context_init() assigns */
5550         c->log_level_max = -1;
5551
5552         exec_context_free_log_extra_fields(c);
5553         c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5554         c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
5555
5556         c->log_ratelimit_interval_usec = 0;
5557         c->log_ratelimit_burst = 0;
5558
5559         c->stdin_data = mfree(c->stdin_data);
5560         c->stdin_data_size = 0;
5561
5562         c->network_namespace_path = mfree(c->network_namespace_path);
5563         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5564
5565         c->log_namespace = mfree(c->log_namespace);
5566
5567         c->load_credentials = hashmap_free(c->load_credentials);
5568         c->set_credentials = hashmap_free(c->set_credentials);
5569 }
5570
5571 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
             /* Removes every directory listed in c->directories[EXEC_DIRECTORY_RUNTIME] below runtime_prefix,
              * together with the symlinks configured for it. Does nothing and returns 0 if runtime_prefix is
              * NULL. Returns -ENOMEM if a path cannot be allocated (entries processed before that point stay
              * removed), 0 otherwise. Failures of rm_rf() and unlink() are deliberately ignored. */
5572         assert(c);
5573
5574         if (!runtime_prefix)
5575                 return 0;
5576
5577         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5578                 _cleanup_free_ char *p = NULL;
5579
                     /* Private runtime directories live in the "private" subdirectory of the prefix. */
5580                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5581                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5582                 else
5583                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5584                 if (!p)
5585                         return -ENOMEM;
5586
5587                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5588                  * service next. */
5589                 (void) rm_rf(p, REMOVE_ROOT);
5590
                     /* The symlinks are resolved against the same base as the directory itself. */
5591                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5592                         _cleanup_free_ char *symlink_abs = NULL;
5593
5594                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5595                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5596                         else
5597                                 symlink_abs = path_join(runtime_prefix, *symlink);
5598                         if (!symlink_abs)
5599                                 return -ENOMEM;
5600
5601                         (void) unlink(symlink_abs);
5602                 }
5603
5604         }
5605
5606         return 0;
5607 }
5608
5609 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
             /* Removes the credentials directory <runtime_prefix>/credentials/<unit>. Returns 0 without doing
              * anything if runtime_prefix or unit is NULL, -ENOMEM if the path cannot be allocated, and 0
              * otherwise; failures of the unmount and removal are ignored. 'c' is only asserted to be
              * non-NULL, none of its fields are read here. */
5610         _cleanup_free_ char *p = NULL;
5611
5612         assert(c);
5613
5614         if (!runtime_prefix || !unit)
5615                 return 0;
5616
5617         p = path_join(runtime_prefix, "credentials", unit);
5618         if (!p)
5619                 return -ENOMEM;
5620
5621         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5622          * unmount it, and afterwards remove the mount point */
5623         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5624         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5625
5626         return 0;
5627 }
5628
5629 static void exec_command_done(ExecCommand *c) {
             /* Frees the path and argv of a single command and overwrites both fields with the return values
              * of the free helpers. The ExecCommand object itself is left allocated. */
5630         assert(c);
5631
5632         c->path = mfree(c->path);
5633         c->argv = strv_free(c->argv);
5634 }
5635
5636 void exec_command_done_array(ExecCommand *c, size_t n) {
             /* Applies exec_command_done() to each of the n commands stored contiguously starting at c. */
5637         for (size_t i = 0; i < n; i++)
5638                 exec_command_done(c+i);
5639 }
5640
5641 ExecCommand* exec_command_free_list(ExecCommand *c) {
             /* Destroys a whole list of commands: the current head is repeatedly unlinked, its contents are
              * released via exec_command_done(), and the entry itself is freed. Always returns NULL, which
              * lets callers reset their list pointer in the same statement (see exec_command_free_array()).
              * NOTE(review): the loop relies on LIST_REMOVE() advancing the head 'c' to the next entry —
              * confirm against the macro definition. */
5642         ExecCommand *i;
5643
5644         while ((i = c)) {
5645                 LIST_REMOVE(command, c, i);
5646                 exec_command_done(i);
5647                 free(i);
5648         }
5649
5650         return NULL;
5651 }
5652
5653 void exec_command_free_array(ExecCommand **c, size_t n) {
             /* Frees each of the n command lists in the array and stores the NULL returned by
              * exec_command_free_list() back into the respective slot. */
5654         for (size_t i = 0; i < n; i++)
5655                 c[i] = exec_command_free_list(c[i]);
5656 }
5657
5658 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
             /* Calls exec_status_reset() on the exec_status of each of the n commands stored contiguously at c. */
5659         for (size_t i = 0; i < n; i++)
5660                 exec_status_reset(&c[i].exec_status);
5661 }
5662
5663 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
             /* Like exec_command_reset_status_array(), but each of the n array slots is the head of a command
              * list, and every entry of every list gets its exec_status reset. */
5664         for (size_t i = 0; i < n; i++)
5665                 LIST_FOREACH(command, z, c[i])
5666                         exec_status_reset(&z->exec_status);
5667 }
5668
5669 typedef struct InvalidEnvInfo {
             /* Userdata passed to invalid_env(): the unit to log for and the environment file being parsed. */
5670         const Unit *unit;
5671         const char *path;
5672 } InvalidEnvInfo;
5673
5674 static void invalid_env(const char *p, void *userdata) {
             /* Callback handed to strv_env_clean_with_callback() by exec_context_load_environment(). Logs the
              * rejected assignment 'p' together with the file path taken from the InvalidEnvInfo passed as
              * userdata. */
5675         InvalidEnvInfo *info = userdata;
5676
5677         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5678 }
5679
5680 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
             /* Returns the fd name to look up for the standard stream fd_index (STDIN_FILENO, STDOUT_FILENO or
              * STDERR_FILENO). A name is only returned if that stream is configured as a named fd; it is the
              * explicitly configured name if there is one, and "stdin"/"stdout"/"stderr" otherwise. Returns
              * NULL for streams not configured as named fd and for any other index. The returned string is
              * either owned by the context or a string literal, so the caller must not free it. */
5681         assert(c);
5682
5683         switch (fd_index) {
5684
5685         case STDIN_FILENO:
5686                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5687                         return NULL;
5688
5689                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5690
5691         case STDOUT_FILENO:
5692                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5693                         return NULL;
5694
5695                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5696
5697         case STDERR_FILENO:
5698                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5699                         return NULL;
5700
5701                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5702
5703         default:
5704                 return NULL;
5705         }
5706 }
5707
5708 static int exec_context_named_iofds(
5709                 const ExecContext *c,
5710                 const ExecParameters *p,
5711                 int named_iofds[static 3]) {
5712
             /* For each standard stream configured as a named fd, searches p->fd_names[] for the stream's name
              * and stores the fd at the same index of p->fds[] in named_iofds[]. Slots are only filled while
              * they are still negative, so the caller is expected to pass them in as negative values
              * (exec_spawn() uses -1); slots of streams that are not named fds are left untouched. Returns 0
              * if every requested stream was resolved, -ENOENT otherwise (slots filled so far stay filled). */

5713         size_t targets;
5714         const char* stdio_fdname[3];
5715         size_t n_fds;
5716
5717         assert(c);
5718         assert(p);
5719         assert(named_iofds);
5720
             /* Number of streams that still need an fd; each comparison contributes 0 or 1. */
5721         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5722                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5723                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5724
5725         for (size_t i = 0; i < 3; i++)
5726                 stdio_fdname[i] = exec_context_fdname(c, i);
5727
5728         n_fds = p->n_storage_fds + p->n_socket_fds;
5729
             /* Stop early once all requested streams are resolved. Because of the else-if chain a single
              * passed fd satisfies at most one stream: if two streams use the same name, the second one is
              * only resolved by a further fd carrying that name. */
5730         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5731                 if (named_iofds[STDIN_FILENO] < 0 &&
5732                     c->std_input == EXEC_INPUT_NAMED_FD &&
5733                     stdio_fdname[STDIN_FILENO] &&
5734                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5735
5736                         named_iofds[STDIN_FILENO] = p->fds[i];
5737                         targets--;
5738
5739                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5740                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5741                            stdio_fdname[STDOUT_FILENO] &&
5742                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5743
5744                         named_iofds[STDOUT_FILENO] = p->fds[i];
5745                         targets--;
5746
5747                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5748                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5749                            stdio_fdname[STDERR_FILENO] &&
5750                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5751
5752                         named_iofds[STDERR_FILENO] = p->fds[i];
5753                         targets--;
5754                 }
5755
5756         return targets == 0 ? 0 : -ENOENT;
5757 }
5758
5759 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
             /* Loads all files listed in c->environment_files and hands the combined assignments to the caller
              * via *ret, which is NULL if nothing was loaded; ownership passes to the caller. An entry
              * prefixed with '-' is optional: problems with it are skipped instead of being reported.
              * Returns 0 on success, -EINVAL for a mandatory entry that is not an absolute path, -ENOMEM if
              * merging fails, or the error returned by safe_glob()/load_env_file() for a mandatory entry.
              * On failure *ret is not written. */
5760         _cleanup_strv_free_ char **v = NULL;
5761         int r;
5762
5763         assert(c);
5764         assert(ret);
5765
5766         STRV_FOREACH(i, c->environment_files) {
5767                 _cleanup_globfree_ glob_t pglob = {};
5768                 bool ignore = false;
5769                 char *fn = *i;
5770
                     /* Strip the "optional" marker; fn then points at the actual path inside the same string. */
5771                 if (fn[0] == '-') {
5772                         ignore = true;
5773                         fn++;
5774                 }
5775
5776                 if (!path_is_absolute(fn)) {
5777                         if (ignore)
5778                                 continue;
5779                         return -EINVAL;
5780                 }
5781
5782                 /* Filename supports globbing, take all matching files */
5783                 r = safe_glob(fn, 0, &pglob);
5784                 if (r < 0) {
5785                         if (ignore)
5786                                 continue;
5787                         return r;
5788                 }
5789
5790                 /* When we don't match anything, -ENOENT should be returned */
5791                 assert(pglob.gl_pathc > 0);
5792
5793                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5794                         _cleanup_strv_free_ char **p = NULL;
5795
5796                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5797                         if (r < 0) {
                                     /* For optional entries only the failing match is skipped; the
                                      * remaining matches of the same glob are still processed. */
5798                                 if (ignore)
5799                                         continue;
5800                                 return r;
5801                         }
5802
5803                         /* Log invalid environment variables with filename */
5804                         if (p) {
5805                                 InvalidEnvInfo info = {
5806                                         .unit = unit,
5807                                         .path = pglob.gl_pathv[n]
5808                                 };
5809
5810                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5811                         }
5812
                             /* The first file's list is adopted as is; each later one is merged into the
                              * accumulated list. NOTE(review): presumably assignments from later files
                              * override earlier ones — confirm against strv_env_merge(). */
5813                         if (!v)
5814                                 v = TAKE_PTR(p);
5815                         else {
5816                                 char **m = strv_env_merge(v, p);
5817                                 if (!m)
5818                                         return -ENOMEM;
5819
5820                                 strv_free_and_replace(v, m);
5821                         }
5822                 }
5823         }
5824
5825         *ret = TAKE_PTR(v);
5826
5827         return 0;
5828 }
5829
5830 static bool tty_may_match_dev_console(const char *tty) {
             /* Tells whether 'tty' may be the same device as the console. The answer errs on the side of
              * "true": a NULL tty and a failure of resolve_dev_console() both yield true. Otherwise the name
              * (with any /dev prefix skipped) is compared against "console" and against the name that
              * resolve_dev_console() returned. */
5831         _cleanup_free_ char *resolved = NULL;
5832
5833         if (!tty)
5834                 return true;
5835
5836         tty = skip_dev_prefix(tty);
5837
5838         /* trivial identity? */
5839         if (streq(tty, "console"))
5840                 return true;
5841
5842         if (resolve_dev_console(&resolved) < 0)
5843                 return true; /* if we could not resolve, assume it may */
5844
5845         /* "tty0" means the active VC, so it may be the same sometimes */
5846         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5847 }
5848
5849 static bool exec_context_may_touch_tty(const ExecContext *ec) {
             /* True if the context asks for a TTY reset, vhangup or VT disallocation, or if any of its three
              * standard streams is classified as terminal input/output by is_terminal_input()/
              * is_terminal_output(). */
5850         assert(ec);
5851
5852         return ec->tty_reset ||
5853                 ec->tty_vhangup ||
5854                 ec->tty_vt_disallocate ||
5855                 is_terminal_input(ec->std_input) ||
5856                 is_terminal_output(ec->std_output) ||
5857                 is_terminal_output(ec->std_error);
5858 }
5859
5860 bool exec_context_may_touch_console(const ExecContext *ec) {
5861
5862         return exec_context_may_touch_tty(ec) &&
5863                tty_may_match_dev_console(exec_context_tty_path(ec));
5864 }
5865
5866 static void strv_fprintf(FILE *f, char **l) {
5867         assert(f);
5868
5869         STRV_FOREACH(g, l)
5870                 fprintf(f, " %s", *g);
5871 }
5872
/* Prints "<prefix><name>:" followed by the space-separated entries of the list and a newline. Prints
 * nothing at all if the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5884
5885 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5886         int r;
5887
5888         assert(c);
5889         assert(f);
5890
5891         prefix = strempty(prefix);
5892
5893         fprintf(f,
5894                 "%sUMask: %04o\n"
5895                 "%sWorkingDirectory: %s\n"
5896                 "%sRootDirectory: %s\n"
5897                 "%sNonBlocking: %s\n"
5898                 "%sPrivateTmp: %s\n"
5899                 "%sPrivateDevices: %s\n"
5900                 "%sProtectKernelTunables: %s\n"
5901                 "%sProtectKernelModules: %s\n"
5902                 "%sProtectKernelLogs: %s\n"
5903                 "%sProtectClock: %s\n"
5904                 "%sProtectControlGroups: %s\n"
5905                 "%sPrivateNetwork: %s\n"
5906                 "%sPrivateUsers: %s\n"
5907                 "%sProtectHome: %s\n"
5908                 "%sProtectSystem: %s\n"
5909                 "%sMountAPIVFS: %s\n"
5910                 "%sIgnoreSIGPIPE: %s\n"
5911                 "%sMemoryDenyWriteExecute: %s\n"
5912                 "%sRestrictRealtime: %s\n"
5913                 "%sRestrictSUIDSGID: %s\n"
5914                 "%sKeyringMode: %s\n"
5915                 "%sProtectHostname: %s\n"
5916                 "%sProtectProc: %s\n"
5917                 "%sProcSubset: %s\n",
5918                 prefix, c->umask,
5919                 prefix, empty_to_root(c->working_directory),
5920                 prefix, empty_to_root(c->root_directory),
5921                 prefix, yes_no(c->non_blocking),
5922                 prefix, yes_no(c->private_tmp),
5923                 prefix, yes_no(c->private_devices),
5924                 prefix, yes_no(c->protect_kernel_tunables),
5925                 prefix, yes_no(c->protect_kernel_modules),
5926                 prefix, yes_no(c->protect_kernel_logs),
5927                 prefix, yes_no(c->protect_clock),
5928                 prefix, yes_no(c->protect_control_groups),
5929                 prefix, yes_no(c->private_network),
5930                 prefix, yes_no(c->private_users),
5931                 prefix, protect_home_to_string(c->protect_home),
5932                 prefix, protect_system_to_string(c->protect_system),
5933                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5934                 prefix, yes_no(c->ignore_sigpipe),
5935                 prefix, yes_no(c->memory_deny_write_execute),
5936                 prefix, yes_no(c->restrict_realtime),
5937                 prefix, yes_no(c->restrict_suid_sgid),
5938                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5939                 prefix, yes_no(c->protect_hostname),
5940                 prefix, protect_proc_to_string(c->protect_proc),
5941                 prefix, proc_subset_to_string(c->proc_subset));
5942
5943         if (c->root_image)
5944                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5945
5946         if (c->root_image_options) {
5947                 fprintf(f, "%sRootImageOptions:", prefix);
5948                 LIST_FOREACH(mount_options, o, c->root_image_options)
5949                         if (!isempty(o->options))
5950                                 fprintf(f, " %s:%s",
5951                                         partition_designator_to_string(o->partition_designator),
5952                                         o->options);
5953                 fprintf(f, "\n");
5954         }
5955
5956         if (c->root_hash) {
5957                 _cleanup_free_ char *encoded = NULL;
5958                 encoded = hexmem(c->root_hash, c->root_hash_size);
5959                 if (encoded)
5960                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5961         }
5962
5963         if (c->root_hash_path)
5964                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5965
5966         if (c->root_hash_sig) {
5967                 _cleanup_free_ char *encoded = NULL;
5968                 ssize_t len;
5969                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5970                 if (len)
5971                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5972         }
5973
5974         if (c->root_hash_sig_path)
5975                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5976
5977         if (c->root_verity)
5978                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5979
5980         STRV_FOREACH(e, c->environment)
5981                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5982
5983         STRV_FOREACH(e, c->environment_files)
5984                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5985
5986         STRV_FOREACH(e, c->pass_environment)
5987                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5988
5989         STRV_FOREACH(e, c->unset_environment)
5990                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5991
5992         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5993
5994         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5995                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5996
5997                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5998                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5999
6000                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6001                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6002                 }
6003         }
6004
6005         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6006
6007         if (c->nice_set)
6008                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6009
6010         if (c->oom_score_adjust_set)
6011                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6012
6013         if (c->coredump_filter_set)
6014                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6015
6016         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6017                 if (c->rlimit[i]) {
6018                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6019                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6020                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6021                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6022                 }
6023
6024         if (c->ioprio_set) {
6025                 _cleanup_free_ char *class_str = NULL;
6026
6027                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6028                 if (r >= 0)
6029                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6030
6031                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6032         }
6033
6034         if (c->cpu_sched_set) {
6035                 _cleanup_free_ char *policy_str = NULL;
6036
6037                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6038                 if (r >= 0)
6039                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6040
6041                 fprintf(f,
6042                         "%sCPUSchedulingPriority: %i\n"
6043                         "%sCPUSchedulingResetOnFork: %s\n",
6044                         prefix, c->cpu_sched_priority,
6045                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6046         }
6047
6048         if (c->cpu_set.set) {
6049                 _cleanup_free_ char *affinity = NULL;
6050
6051                 affinity = cpu_set_to_range_string(&c->cpu_set);
6052                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6053         }
6054
6055         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6056                 _cleanup_free_ char *nodes = NULL;
6057
6058                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6059                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6060                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6061         }
6062
6063         if (c->timer_slack_nsec != NSEC_INFINITY)
6064                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6065
6066         fprintf(f,
6067                 "%sStandardInput: %s\n"
6068                 "%sStandardOutput: %s\n"
6069                 "%sStandardError: %s\n",
6070                 prefix, exec_input_to_string(c->std_input),
6071                 prefix, exec_output_to_string(c->std_output),
6072                 prefix, exec_output_to_string(c->std_error));
6073
6074         if (c->std_input == EXEC_INPUT_NAMED_FD)
6075                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6076         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6077                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6078         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6079                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6080
6081         if (c->std_input == EXEC_INPUT_FILE)
6082                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6083         if (c->std_output == EXEC_OUTPUT_FILE)
6084                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6085         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6086                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6087         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6088                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6089         if (c->std_error == EXEC_OUTPUT_FILE)
6090                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6091         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6092                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6093         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6094                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6095
6096         if (c->tty_path)
6097                 fprintf(f,
6098                         "%sTTYPath: %s\n"
6099                         "%sTTYReset: %s\n"
6100                         "%sTTYVHangup: %s\n"
6101                         "%sTTYVTDisallocate: %s\n"
6102                         "%sTTYRows: %u\n"
6103                         "%sTTYColumns: %u\n",
6104                         prefix, c->tty_path,
6105                         prefix, yes_no(c->tty_reset),
6106                         prefix, yes_no(c->tty_vhangup),
6107                         prefix, yes_no(c->tty_vt_disallocate),
6108                         prefix, c->tty_rows,
6109                         prefix, c->tty_cols);
6110
6111         if (IN_SET(c->std_output,
6112                    EXEC_OUTPUT_KMSG,
6113                    EXEC_OUTPUT_JOURNAL,
6114                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6115                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6116             IN_SET(c->std_error,
6117                    EXEC_OUTPUT_KMSG,
6118                    EXEC_OUTPUT_JOURNAL,
6119                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6120                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6121
6122                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6123
6124                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6125                 if (r >= 0)
6126                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6127
6128                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6129                 if (r >= 0)
6130                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6131         }
6132
6133         if (c->log_level_max >= 0) {
6134                 _cleanup_free_ char *t = NULL;
6135
6136                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6137
6138                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6139         }
6140
6141         if (c->log_ratelimit_interval_usec > 0)
6142                 fprintf(f,
6143                         "%sLogRateLimitIntervalSec: %s\n",
6144                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6145
6146         if (c->log_ratelimit_burst > 0)
6147                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6148
6149         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6150                 fprintf(f, "%sLogFilterPatterns:", prefix);
6151
6152                 char *pattern;
6153                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6154                         fprintf(f, " %s", pattern);
6155                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6156                         fprintf(f, " ~%s", pattern);
6157                 fputc('\n', f);
6158         }
6159
6160         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6161                 fprintf(f, "%sLogExtraFields: ", prefix);
6162                 fwrite(c->log_extra_fields[j].iov_base,
6163                        1, c->log_extra_fields[j].iov_len,
6164                        f);
6165                 fputc('\n', f);
6166         }
6167
6168         if (c->log_namespace)
6169                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6170
6171         if (c->secure_bits) {
6172                 _cleanup_free_ char *str = NULL;
6173
6174                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6175                 if (r >= 0)
6176                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6177         }
6178
6179         if (c->capability_bounding_set != CAP_ALL) {
6180                 _cleanup_free_ char *str = NULL;
6181
6182                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6183                 if (r >= 0)
6184                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6185         }
6186
6187         if (c->capability_ambient_set != 0) {
6188                 _cleanup_free_ char *str = NULL;
6189
6190                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6191                 if (r >= 0)
6192                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6193         }
6194
6195         if (c->user)
6196                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6197         if (c->group)
6198                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6199
6200         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6201
6202         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6203
6204         if (c->pam_name)
6205                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6206
6207         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6208         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6209         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6210         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6211         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6212         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6213
6214         for (size_t i = 0; i < c->n_bind_mounts; i++)
6215                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6216                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6217                         c->bind_mounts[i].ignore_enoent ? "-": "",
6218                         c->bind_mounts[i].source,
6219                         c->bind_mounts[i].destination,
6220                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6221
6222         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6223                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6224
6225                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6226                         t->path,
6227                         isempty(t->options) ? "" : ":",
6228                         strempty(t->options));
6229         }
6230
6231         if (c->utmp_id)
6232                 fprintf(f,
6233                         "%sUtmpIdentifier: %s\n",
6234                         prefix, c->utmp_id);
6235
6236         if (c->selinux_context)
6237                 fprintf(f,
6238                         "%sSELinuxContext: %s%s\n",
6239                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6240
6241         if (c->apparmor_profile)
6242                 fprintf(f,
6243                         "%sAppArmorProfile: %s%s\n",
6244                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6245
6246         if (c->smack_process_label)
6247                 fprintf(f,
6248                         "%sSmackProcessLabel: %s%s\n",
6249                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6250
6251         if (c->personality != PERSONALITY_INVALID)
6252                 fprintf(f,
6253                         "%sPersonality: %s\n",
6254                         prefix, strna(personality_to_string(c->personality)));
6255
6256         fprintf(f,
6257                 "%sLockPersonality: %s\n",
6258                 prefix, yes_no(c->lock_personality));
6259
6260         if (c->syscall_filter) {
6261                 fprintf(f,
6262                         "%sSystemCallFilter: ",
6263                         prefix);
6264
6265                 if (!c->syscall_allow_list)
6266                         fputc('~', f);
6267
6268 #if HAVE_SECCOMP
6269                 void *id, *val;
6270                 bool first = true;
6271                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6272                         _cleanup_free_ char *name = NULL;
6273                         const char *errno_name = NULL;
6274                         int num = PTR_TO_INT(val);
6275
6276                         if (first)
6277                                 first = false;
6278                         else
6279                                 fputc(' ', f);
6280
6281                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6282                         fputs(strna(name), f);
6283
6284                         if (num >= 0) {
6285                                 errno_name = seccomp_errno_or_action_to_string(num);
6286                                 if (errno_name)
6287                                         fprintf(f, ":%s", errno_name);
6288                                 else
6289                                         fprintf(f, ":%d", num);
6290                         }
6291                 }
6292 #endif
6293
6294                 fputc('\n', f);
6295         }
6296
6297         if (c->syscall_archs) {
6298                 fprintf(f,
6299                         "%sSystemCallArchitectures:",
6300                         prefix);
6301
6302 #if HAVE_SECCOMP
6303                 void *id;
6304                 SET_FOREACH(id, c->syscall_archs)
6305                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6306 #endif
6307                 fputc('\n', f);
6308         }
6309
6310         if (exec_context_restrict_namespaces_set(c)) {
6311                 _cleanup_free_ char *s = NULL;
6312
6313                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6314                 if (r >= 0)
6315                         fprintf(f, "%sRestrictNamespaces: %s\n",
6316                                 prefix, strna(s));
6317         }
6318
6319 #if HAVE_LIBBPF
6320         if (exec_context_restrict_filesystems_set(c)) {
6321                 char *fs;
6322                 SET_FOREACH(fs, c->restrict_filesystems)
6323                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6324         }
6325 #endif
6326
6327         if (c->network_namespace_path)
6328                 fprintf(f,
6329                         "%sNetworkNamespacePath: %s\n",
6330                         prefix, c->network_namespace_path);
6331
6332         if (c->syscall_errno > 0) {
6333                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6334
6335 #if HAVE_SECCOMP
6336                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6337                 if (errno_name)
6338                         fputs(errno_name, f);
6339                 else
6340                         fprintf(f, "%d", c->syscall_errno);
6341 #endif
6342                 fputc('\n', f);
6343         }
6344
6345         for (size_t i = 0; i < c->n_mount_images; i++) {
6346                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6347                         c->mount_images[i].ignore_enoent ? "-": "",
6348                         c->mount_images[i].source,
6349                         c->mount_images[i].destination);
6350                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6351                         fprintf(f, ":%s:%s",
6352                                 partition_designator_to_string(o->partition_designator),
6353                                 strempty(o->options));
6354                 fprintf(f, "\n");
6355         }
6356
6357         for (size_t i = 0; i < c->n_extension_images; i++) {
6358                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6359                         c->extension_images[i].ignore_enoent ? "-": "",
6360                         c->extension_images[i].source);
6361                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6362                         fprintf(f, ":%s:%s",
6363                                 partition_designator_to_string(o->partition_designator),
6364                                 strempty(o->options));
6365                 fprintf(f, "\n");
6366         }
6367
6368         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6369 }
6370
6371 bool exec_context_maintains_privileges(const ExecContext *c) {
6372         assert(c);
6373
6374         /* Returns true if the process forked off would run under
6375          * an unchanged UID or as root. */
6376
6377         if (!c->user)
6378                 return true;
6379
6380         if (streq(c->user, "root") || streq(c->user, "0"))
6381                 return true;
6382
6383         return false;
6384 }
6385
6386 int exec_context_get_effective_ioprio(const ExecContext *c) {
6387         int p;
6388
6389         assert(c);
6390
6391         if (c->ioprio_set)
6392                 return c->ioprio;
6393
6394         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6395         if (p < 0)
6396                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6397
6398         return ioprio_normalize(p);
6399 }
6400
6401 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6402         assert(c);
6403
6404         /* Explicit setting wins */
6405         if (c->mount_apivfs_set)
6406                 return c->mount_apivfs;
6407
6408         /* Default to "yes" if root directory or image are specified */
6409         if (exec_context_with_rootfs(c))
6410                 return true;
6411
6412         return false;
6413 }
6414
6415 void exec_context_free_log_extra_fields(ExecContext *c) {
6416         assert(c);
6417
6418         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6419                 free(c->log_extra_fields[l].iov_base);
6420         c->log_extra_fields = mfree(c->log_extra_fields);
6421         c->n_log_extra_fields = 0;
6422 }
6423
6424 void exec_context_revert_tty(ExecContext *c) {
6425         _cleanup_close_ int fd = -EBADF;
6426         const char *path;
6427         struct stat st;
6428         int r;
6429
6430         assert(c);
6431
6432         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6433         exec_context_tty_reset(c, NULL);
6434
6435         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6436          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6437          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6438         if (!exec_context_may_touch_tty(c))
6439                 return;
6440
6441         path = exec_context_tty_path(c);
6442         if (!path)
6443                 return;
6444
6445         fd = open(path, O_PATH|O_CLOEXEC);
6446         if (fd < 0)
6447                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6448                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6449                                              path);
6450
6451         if (fstat(fd, &st) < 0)
6452                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6453
6454         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6455          * if things are a character device, since a proper check either means we'd have to open the TTY and
6456          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6457          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6458          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6459         if (!S_ISCHR(st.st_mode))
6460                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6461
6462         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6463         if (r < 0)
6464                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6465 }
6466
6467 int exec_context_get_clean_directories(
6468                 ExecContext *c,
6469                 char **prefix,
6470                 ExecCleanMask mask,
6471                 char ***ret) {
6472
6473         _cleanup_strv_free_ char **l = NULL;
6474         int r;
6475
6476         assert(c);
6477         assert(prefix);
6478         assert(ret);
6479
6480         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6481                 if (!FLAGS_SET(mask, 1U << t))
6482                         continue;
6483
6484                 if (!prefix[t])
6485                         continue;
6486
6487                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6488                         char *j;
6489
6490                         j = path_join(prefix[t], c->directories[t].items[i].path);
6491                         if (!j)
6492                                 return -ENOMEM;
6493
6494                         r = strv_consume(&l, j);
6495                         if (r < 0)
6496                                 return r;
6497
6498                         /* Also remove private directories unconditionally. */
6499                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6500                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6501                                 if (!j)
6502                                         return -ENOMEM;
6503
6504                                 r = strv_consume(&l, j);
6505                                 if (r < 0)
6506                                         return r;
6507                         }
6508
6509                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6510                                 j = path_join(prefix[t], *symlink);
6511                                 if (!j)
6512                                         return -ENOMEM;
6513
6514                                 r = strv_consume(&l, j);
6515                                 if (r < 0)
6516                                         return r;
6517                         }
6518                 }
6519         }
6520
6521         *ret = TAKE_PTR(l);
6522         return 0;
6523 }
6524
6525 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6526         ExecCleanMask mask = 0;
6527
6528         assert(c);
6529         assert(ret);
6530
6531         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6532                 if (c->directories[t].n_items > 0)
6533                         mask |= 1U << t;
6534
6535         *ret = mask;
6536         return 0;
6537 }
6538
6539 void exec_status_start(ExecStatus *s, pid_t pid) {
6540         assert(s);
6541
6542         *s = (ExecStatus) {
6543                 .pid = pid,
6544         };
6545
6546         dual_timestamp_get(&s->start_timestamp);
6547 }
6548
6549 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6550         assert(s);
6551
6552         if (s->pid != pid)
6553                 *s = (ExecStatus) {
6554                         .pid = pid,
6555                 };
6556
6557         dual_timestamp_get(&s->exit_timestamp);
6558
6559         s->code = code;
6560         s->status = status;
6561
6562         if (context && context->utmp_id)
6563                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6564 }
6565
6566 void exec_status_reset(ExecStatus *s) {
6567         assert(s);
6568
6569         *s = (ExecStatus) {};
6570 }
6571
6572 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6573         assert(s);
6574         assert(f);
6575
6576         if (s->pid <= 0)
6577                 return;
6578
6579         prefix = strempty(prefix);
6580
6581         fprintf(f,
6582                 "%sPID: "PID_FMT"\n",
6583                 prefix, s->pid);
6584
6585         if (dual_timestamp_is_set(&s->start_timestamp))
6586                 fprintf(f,
6587                         "%sStart Timestamp: %s\n",
6588                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6589
6590         if (dual_timestamp_is_set(&s->exit_timestamp))
6591                 fprintf(f,
6592                         "%sExit Timestamp: %s\n"
6593                         "%sExit Code: %s\n"
6594                         "%sExit Status: %i\n",
6595                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6596                         prefix, sigchld_code_to_string(s->code),
6597                         prefix, s->status);
6598 }
6599
6600 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6601         _cleanup_free_ char *cmd = NULL;
6602         const char *prefix2;
6603
6604         assert(c);
6605         assert(f);
6606
6607         prefix = strempty(prefix);
6608         prefix2 = strjoina(prefix, "\t");
6609
6610         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6611
6612         fprintf(f,
6613                 "%sCommand Line: %s\n",
6614                 prefix, strnull(cmd));
6615
6616         exec_status_dump(&c->exec_status, f, prefix2);
6617 }
6618
6619 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6620         assert(f);
6621
6622         prefix = strempty(prefix);
6623
6624         LIST_FOREACH(command, i, c)
6625                 exec_command_dump(i, f, prefix);
6626 }
6627
6628 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6629         ExecCommand *end;
6630
6631         assert(l);
6632         assert(e);
6633
6634         if (*l) {
6635                 /* It's kind of important, that we keep the order here */
6636                 LIST_FIND_TAIL(command, *l, end);
6637                 LIST_INSERT_AFTER(command, *l, end, e);
6638         } else
6639               *l = e;
6640 }
6641
6642 int exec_command_set(ExecCommand *c, const char *path, ...) {
6643         va_list ap;
6644         char **l, *p;
6645
6646         assert(c);
6647         assert(path);
6648
6649         va_start(ap, path);
6650         l = strv_new_ap(path, ap);
6651         va_end(ap);
6652
6653         if (!l)
6654                 return -ENOMEM;
6655
6656         p = strdup(path);
6657         if (!p) {
6658                 strv_free(l);
6659                 return -ENOMEM;
6660         }
6661
6662         free_and_replace(c->path, p);
6663
6664         return strv_free_and_replace(c->argv, l);
6665 }
6666
6667 int exec_command_append(ExecCommand *c, const char *path, ...) {
6668         _cleanup_strv_free_ char **l = NULL;
6669         va_list ap;
6670         int r;
6671
6672         assert(c);
6673         assert(path);
6674
6675         va_start(ap, path);
6676         l = strv_new_ap(path, ap);
6677         va_end(ap);
6678
6679         if (!l)
6680                 return -ENOMEM;
6681
6682         r = strv_extend_strv(&c->argv, l, false);
6683         if (r < 0)
6684                 return r;
6685
6686         return 0;
6687 }
6688
6689 static void *remove_tmpdir_thread(void *p) {
6690         _cleanup_free_ char *path = p;
6691
6692         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6693         return NULL;
6694 }
6695
6696 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6697         int r;
6698
6699         if (!rt)
6700                 return NULL;
6701
6702         if (rt->manager)
6703                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6704
6705         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6706
6707         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6708                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6709
6710                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6711                 if (r < 0)
6712                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6713                 else
6714                         rt->tmp_dir = NULL;
6715         }
6716
6717         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6718                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6719
6720                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6721                 if (r < 0)
6722                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6723                 else
6724                         rt->var_tmp_dir = NULL;
6725         }
6726
6727         rt->id = mfree(rt->id);
6728         rt->tmp_dir = mfree(rt->tmp_dir);
6729         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6730         safe_close_pair(rt->netns_storage_socket);
6731         safe_close_pair(rt->ipcns_storage_socket);
6732         return mfree(rt);
6733 }
6734
6735 static void exec_runtime_freep(ExecRuntime **rt) {
6736         (void) exec_runtime_free(*rt, false);
6737 }
6738
6739 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6740         _cleanup_free_ char *id_copy = NULL;
6741         ExecRuntime *n;
6742
6743         assert(ret);
6744
6745         id_copy = strdup(id);
6746         if (!id_copy)
6747                 return -ENOMEM;
6748
6749         n = new(ExecRuntime, 1);
6750         if (!n)
6751                 return -ENOMEM;
6752
6753         *n = (ExecRuntime) {
6754                 .id = TAKE_PTR(id_copy),
6755                 .netns_storage_socket = PIPE_EBADF,
6756                 .ipcns_storage_socket = PIPE_EBADF,
6757         };
6758
6759         *ret = n;
6760         return 0;
6761 }
6762
6763 static int exec_runtime_add(
6764                 Manager *m,
6765                 const char *id,
6766                 char **tmp_dir,
6767                 char **var_tmp_dir,
6768                 int netns_storage_socket[2],
6769                 int ipcns_storage_socket[2],
6770                 ExecRuntime **ret) {
6771
6772         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6773         int r;
6774
6775         assert(m);
6776         assert(id);
6777
6778         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6779
6780         r = exec_runtime_allocate(&rt, id);
6781         if (r < 0)
6782                 return r;
6783
6784         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6785         if (r < 0)
6786                 return r;
6787
6788         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6789         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6790         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6791
6792         if (netns_storage_socket) {
6793                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6794                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6795         }
6796
6797         if (ipcns_storage_socket) {
6798                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6799                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6800         }
6801
6802         rt->manager = m;
6803
6804         if (ret)
6805                 *ret = rt;
6806         /* do not remove created ExecRuntime object when the operation succeeds. */
6807         TAKE_PTR(rt);
6808         return 0;
6809 }
6810
6811 static int exec_runtime_make(
6812                 Manager *m,
6813                 const ExecContext *c,
6814                 const char *id,
6815                 ExecRuntime **ret) {
6816
6817         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6818         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6819         int r;
6820
6821         assert(m);
6822         assert(c);
6823         assert(id);
6824
6825         /* It is not necessary to create ExecRuntime object. */
6826         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6827                 *ret = NULL;
6828                 return 0;
6829         }
6830
6831         if (c->private_tmp &&
6832             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6833               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6834                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6835                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6836                 if (r < 0)
6837                         return r;
6838         }
6839
6840         if (c->private_network || c->network_namespace_path) {
6841                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6842                         return -errno;
6843         }
6844
6845         if (c->private_ipc || c->ipc_namespace_path) {
6846                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6847                         return -errno;
6848         }
6849
6850         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6851         if (r < 0)
6852                 return r;
6853
6854         return 1;
6855 }
6856
6857 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6858         ExecRuntime *rt;
6859         int r;
6860
6861         assert(m);
6862         assert(id);
6863         assert(ret);
6864
6865         rt = hashmap_get(m->exec_runtime_by_id, id);
6866         if (rt)
6867                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6868                 goto ref;
6869
6870         if (!create) {
6871                 *ret = NULL;
6872                 return 0;
6873         }
6874
6875         /* If not found, then create a new object. */
6876         r = exec_runtime_make(m, c, id, &rt);
6877         if (r < 0)
6878                 return r;
6879         if (r == 0) {
6880                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6881                 *ret = NULL;
6882                 return 0;
6883         }
6884
6885 ref:
6886         /* increment reference counter. */
6887         rt->n_ref++;
6888         *ret = rt;
6889         return 1;
6890 }
6891
6892 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6893         if (!rt)
6894                 return NULL;
6895
6896         assert(rt->n_ref > 0);
6897
6898         rt->n_ref--;
6899         if (rt->n_ref > 0)
6900                 return NULL;
6901
6902         return exec_runtime_free(rt, destroy);
6903 }
6904
6905 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6906         ExecRuntime *rt;
6907
6908         assert(m);
6909         assert(f);
6910         assert(fds);
6911
6912         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6913                 fprintf(f, "exec-runtime=%s", rt->id);
6914
6915                 if (rt->tmp_dir)
6916                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6917
6918                 if (rt->var_tmp_dir)
6919                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6920
6921                 if (rt->netns_storage_socket[0] >= 0) {
6922                         int copy;
6923
6924                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6925                         if (copy < 0)
6926                                 return copy;
6927
6928                         fprintf(f, " netns-socket-0=%i", copy);
6929                 }
6930
6931                 if (rt->netns_storage_socket[1] >= 0) {
6932                         int copy;
6933
6934                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6935                         if (copy < 0)
6936                                 return copy;
6937
6938                         fprintf(f, " netns-socket-1=%i", copy);
6939                 }
6940
6941                 if (rt->ipcns_storage_socket[0] >= 0) {
6942                         int copy;
6943
6944                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6945                         if (copy < 0)
6946                                 return copy;
6947
6948                         fprintf(f, " ipcns-socket-0=%i", copy);
6949                 }
6950
6951                 if (rt->ipcns_storage_socket[1] >= 0) {
6952                         int copy;
6953
6954                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6955                         if (copy < 0)
6956                                 return copy;
6957
6958                         fprintf(f, " ipcns-socket-1=%i", copy);
6959                 }
6960
6961                 fputc('\n', f);
6962         }
6963
6964         return 0;
6965 }
6966
6967 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6968         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6969         ExecRuntime *rt;
6970         int r;
6971
6972         /* This is for the migration from old (v237 or earlier) deserialization text.
6973          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6974          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6975          * so or not from the serialized text, then we always creates a new object owned by this. */
6976
6977         assert(u);
6978         assert(key);
6979         assert(value);
6980
6981         /* Manager manages ExecRuntime objects by the unit id.
6982          * So, we omit the serialized text when the unit does not have id (yet?)... */
6983         if (isempty(u->id)) {
6984                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6985                 return 0;
6986         }
6987
6988         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6989                 return log_oom();
6990
6991         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6992         if (!rt) {
6993                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6994                         return log_oom();
6995
6996                 rt = rt_create;
6997         }
6998
6999         if (streq(key, "tmp-dir")) {
7000                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7001                         return -ENOMEM;
7002
7003         } else if (streq(key, "var-tmp-dir")) {
7004                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7005                         return -ENOMEM;
7006
7007         } else if (streq(key, "netns-socket-0")) {
7008                 int fd;
7009
7010                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7011                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7012                         return 0;
7013                 }
7014
7015                 safe_close(rt->netns_storage_socket[0]);
7016                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7017
7018         } else if (streq(key, "netns-socket-1")) {
7019                 int fd;
7020
7021                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7022                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7023                         return 0;
7024                 }
7025
7026                 safe_close(rt->netns_storage_socket[1]);
7027                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7028
7029         } else
7030                 return 0;
7031
7032         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7033         if (rt_create) {
7034                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
7035                 if (r < 0) {
7036                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7037                         return 0;
7038                 }
7039
7040                 rt_create->manager = u->manager;
7041
7042                 /* Avoid cleanup */
7043                 TAKE_PTR(rt_create);
7044         }
7045
7046         return 1;
7047 }
7048
7049 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7050         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7051         char *id = NULL;
7052         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7053         const char *p, *v = ASSERT_PTR(value);
7054         size_t n;
7055
7056         assert(m);
7057         assert(fds);
7058
7059         n = strcspn(v, " ");
7060         id = strndupa_safe(v, n);
7061         if (v[n] != ' ')
7062                 goto finalize;
7063         p = v + n + 1;
7064
7065         v = startswith(p, "tmp-dir=");
7066         if (v) {
7067                 n = strcspn(v, " ");
7068                 tmp_dir = strndup(v, n);
7069                 if (!tmp_dir)
7070                         return log_oom();
7071                 if (v[n] != ' ')
7072                         goto finalize;
7073                 p = v + n + 1;
7074         }
7075
7076         v = startswith(p, "var-tmp-dir=");
7077         if (v) {
7078                 n = strcspn(v, " ");
7079                 var_tmp_dir = strndup(v, n);
7080                 if (!var_tmp_dir)
7081                         return log_oom();
7082                 if (v[n] != ' ')
7083                         goto finalize;
7084                 p = v + n + 1;
7085         }
7086
7087         v = startswith(p, "netns-socket-0=");
7088         if (v) {
7089                 char *buf;
7090
7091                 n = strcspn(v, " ");
7092                 buf = strndupa_safe(v, n);
7093
7094                 r = safe_atoi(buf, &netns_fdpair[0]);
7095                 if (r < 0)
7096                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7097                 if (!fdset_contains(fds, netns_fdpair[0]))
7098                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7099                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7100                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7101                 if (v[n] != ' ')
7102                         goto finalize;
7103                 p = v + n + 1;
7104         }
7105
7106         v = startswith(p, "netns-socket-1=");
7107         if (v) {
7108                 char *buf;
7109
7110                 n = strcspn(v, " ");
7111                 buf = strndupa_safe(v, n);
7112
7113                 r = safe_atoi(buf, &netns_fdpair[1]);
7114                 if (r < 0)
7115                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7116                 if (!fdset_contains(fds, netns_fdpair[1]))
7117                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7118                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7119                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7120                 if (v[n] != ' ')
7121                         goto finalize;
7122                 p = v + n + 1;
7123         }
7124
7125         v = startswith(p, "ipcns-socket-0=");
7126         if (v) {
7127                 char *buf;
7128
7129                 n = strcspn(v, " ");
7130                 buf = strndupa_safe(v, n);
7131
7132                 r = safe_atoi(buf, &ipcns_fdpair[0]);
7133                 if (r < 0)
7134                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7135                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7136                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7137                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7138                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7139                 if (v[n] != ' ')
7140                         goto finalize;
7141                 p = v + n + 1;
7142         }
7143
7144         v = startswith(p, "ipcns-socket-1=");
7145         if (v) {
7146                 char *buf;
7147
7148                 n = strcspn(v, " ");
7149                 buf = strndupa_safe(v, n);
7150
7151                 r = safe_atoi(buf, &ipcns_fdpair[1]);
7152                 if (r < 0)
7153                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7154                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7155                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7156                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7157                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7158         }
7159
7160 finalize:
7161         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7162         if (r < 0)
7163                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7164         return 0;
7165 }
7166
7167 void exec_runtime_vacuum(Manager *m) {
7168         ExecRuntime *rt;
7169
7170         assert(m);
7171
7172         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7173
7174         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
7175                 if (rt->n_ref > 0)
7176                         continue;
7177
7178                 (void) exec_runtime_free(rt, false);
7179         }
7180 }
7181
7182 void exec_params_clear(ExecParameters *p) {
7183         if (!p)
7184                 return;
7185
7186         p->environment = strv_free(p->environment);
7187         p->fd_names = strv_free(p->fd_names);
7188         p->fds = mfree(p->fds);
7189         p->exec_fd = safe_close(p->exec_fd);
7190 }
7191
7192 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7193         if (!sc)
7194                 return NULL;
7195
7196         free(sc->id);
7197         free(sc->data);
7198         return mfree(sc);
7199 }
7200
7201 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7202         if (!lc)
7203                 return NULL;
7204
7205         free(lc->id);
7206         free(lc->path);
7207         return mfree(lc);
7208 }
7209
7210 void exec_directory_done(ExecDirectory *d) {
7211         if (!d)
7212                 return;
7213
7214         for (size_t i = 0; i < d->n_items; i++) {
7215                 free(d->items[i].path);
7216                 strv_free(d->items[i].symlinks);
7217         }
7218
7219         d->items = mfree(d->items);
7220         d->n_items = 0;
7221         d->mode = 0755;
7222 }
7223
7224 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7225         assert(d);
7226         assert(path);
7227
7228         for (size_t i = 0; i < d->n_items; i++)
7229                 if (path_equal(d->items[i].path, path))
7230                         return &d->items[i];
7231
7232         return NULL;
7233 }
7234
7235 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7236         _cleanup_strv_free_ char **s = NULL;
7237         _cleanup_free_ char *p = NULL;
7238         ExecDirectoryItem *existing;
7239         int r;
7240
7241         assert(d);
7242         assert(path);
7243
7244         existing = exec_directory_find(d, path);
7245         if (existing) {
7246                 r = strv_extend(&existing->symlinks, symlink);
7247                 if (r < 0)
7248                         return r;
7249
7250                 return 0; /* existing item is updated */
7251         }
7252
7253         p = strdup(path);
7254         if (!p)
7255                 return -ENOMEM;
7256
7257         if (symlink) {
7258                 s = strv_new(symlink);
7259                 if (!s)
7260                         return -ENOMEM;
7261         }
7262
7263         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7264                 return -ENOMEM;
7265
7266         d->items[d->n_items++] = (ExecDirectoryItem) {
7267                 .path = TAKE_PTR(p),
7268                 .symlinks = TAKE_PTR(s),
7269         };
7270
7271         return 1; /* new item is added */
7272 }
7273
7274 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7275         assert(a);
7276         assert(b);
7277
7278         return path_compare(a->path, b->path);
7279 }
7280
7281 void exec_directory_sort(ExecDirectory *d) {
7282         assert(d);
7283
7284         /* Sort the exec directories to make always parent directories processed at first in
7285          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7286          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7287          * list. See also comments in setup_exec_directory() and issue #24783. */
7288
7289         if (d->n_items <= 1)
7290                 return;
7291
7292         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7293
7294         for (size_t i = 1; i < d->n_items; i++)
7295                 for (size_t j = 0; j < i; j++)
7296                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7297                                 d->items[i].only_create = true;
7298                                 break;
7299                         }
7300 }
7301
/* Hash operations for hashmaps keyed by strings (string_hash_func()/string_compare_func()) whose values are
 * ExecSetCredential resp. ExecLoadCredential objects. The matching *_free() function is registered as value
 * destructor. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7304
/* Maps each ExecInput value to its string name. The lookup helpers for the "exec_input" prefix are
 * generated from this table by the macro below. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7317
/* Maps each ExecOutput value to its string name. The lookup helpers for the "exec_output" prefix are
 * generated from this table by the macro below. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7334
/* Maps each ExecUtmpMode value to its string name. The lookup helpers for the "exec_utmp_mode" prefix are
 * generated from this table by the macro below. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7342
/* Maps each ExecPreserveMode value to its string name. The lookup helpers are generated by the
 * boolean-aware variant of the macro, with EXEC_PRESERVE_YES passed as third argument.
 * NOTE(review): presumably that is the value a boolean "true" string parses to — confirm against the
 * macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7350
/* This table maps each ExecDirectoryType to the name of the unit setting it is configured with. The lookup
 * helpers for the "exec_directory_type" prefix are generated from it by the macro below. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7361
/* This table maps each ExecDirectoryType to the name of the symlink setting it is configured with in the
 * unit. The lookup helpers for the "exec_directory_type_symlink" prefix are generated from it by the macro
 * below. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7372
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. The lookup helpers for the
 * "exec_resource_type" prefix are generated from it by the macro below. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7385
/* And this table also maps ExecDirectoryType, to the name of the environment variable we pass the selected
 * directory to the service payload in. Only a file-private to-string helper is generated for it (no
 * from-string counterpart), as the macro name below indicates. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7397
/* Maps each ExecKeyringMode value to its string name. The lookup helpers for the "exec_keyring_mode" prefix
 * are generated from this table by the macro below. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);