src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/file.h>
   8 #include <sys/ioctl.h>
   9 #include <sys/mman.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_APPARMOR
  29 #include <sys/apparmor.h>
  30 #endif
  31
  32 #include "sd-messages.h"
  33
  34 #include "af-list.h"
  35 #include "alloc-util.h"
  36 #if HAVE_APPARMOR
  37 #include "apparmor-util.h"
  38 #endif
  39 #include "argv-util.h"
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "bpf-lsm.h"
  43 #include "btrfs-util.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "chattr-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase.h"
  49 #include "chown-recursive.h"
  50 #include "constants.h"
  51 #include "cpu-set-util.h"
  52 #include "data-fd-util.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "escape.h"
  57 #include "exec-credential.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "format-util.h"
  62 #include "glob-util.h"
  63 #include "hexdecoct.h"
  64 #include "io-util.h"
  65 #include "ioprio-util.h"
  66 #include "lock-util.h"
  67 #include "log.h"
  68 #include "macro.h"
  69 #include "manager.h"
  70 #include "manager-dump.h"
  71 #include "memory-util.h"
  72 #include "missing_fs.h"
  73 #include "missing_ioprio.h"
  74 #include "missing_prctl.h"
  75 #include "mkdir-label.h"
  76 #include "namespace.h"
  77 #include "parse-util.h"
  78 #include "path-util.h"
  79 #include "proc-cmdline.h"
  80 #include "process-util.h"
  81 #include "psi-util.h"
  82 #include "rlimit-util.h"
  83 #include "rm-rf.h"
  84 #include "seccomp-util.h"
  85 #include "securebits-util.h"
  86 #include "selinux-util.h"
  87 #include "signal-util.h"
  88 #include "smack-util.h"
  89 #include "socket-util.h"
  90 #include "sort-util.h"
  91 #include "special.h"
  92 #include "stat-util.h"
  93 #include "string-table.h"
  94 #include "string-util.h"
  95 #include "strv.h"
  96 #include "syslog-util.h"
  97 #include "terminal-util.h"
  98 #include "tmpfile-util.h"
  99 #include "umask-util.h"
 100 #include "unit-serialize.h"
 101 #include "user-util.h"
 102 #include "utmp-wtmp.h"
 103
 104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 106
 107 #define SNDBUF_SIZE (8*1024*1024)
 108
 109 static int shift_fds(int fds[], size_t n_fds) {
 110         if (n_fds <= 0)
 111                 return 0;
 112
 113         /* Modifies the fds array! (sorts it) */
 114
 115         assert(fds);
 116
 117         for (int start = 0;;) {
 118                 int restart_from = -1;
 119
 120                 for (int i = start; i < (int) n_fds; i++) {
 121                         int nfd;
 122
 123                         /* Already at right index? */
 124                         if (fds[i] == i+3)
 125                                 continue;
 126
 127                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 128                         if (nfd < 0)
 129                                 return -errno;
 130
 131                         safe_close(fds[i]);
 132                         fds[i] = nfd;
 133
 134                         /* Hmm, the fd we wanted isn't free? Then
 135                          * let's remember that and try again from here */
 136                         if (nfd != i+3 && restart_from < 0)
 137                                 restart_from = i;
 138                 }
 139
 140                 if (restart_from < 0)
 141                         break;
 142
 143                 start = restart_from;
 144         }
 145
 146         return 0;
 147 }
 148
 149 static int flags_fds(
 150                 const int fds[],
 151                 size_t n_socket_fds,
 152                 size_t n_fds,
 153                 bool nonblock) {
 154
 155         int r;
 156
 157         if (n_fds <= 0)
 158                 return 0;
 159
 160         assert(fds);
 161
 162         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 163          * O_NONBLOCK only applies to socket activation though. */
 164
 165         for (size_t i = 0; i < n_fds; i++) {
 166
 167                 if (i < n_socket_fds) {
 168                         r = fd_nonblock(fds[i], nonblock);
 169                         if (r < 0)
 170                                 return r;
 171                 }
 172
 173                 /* We unconditionally drop FD_CLOEXEC from the fds,
 174                  * since after all we want to pass these fds to our
 175                  * children */
 176
 177                 r = fd_cloexec(fds[i], false);
 178                 if (r < 0)
 179                         return r;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static const char *exec_context_tty_path(const ExecContext *context) {
 186         assert(context);
 187
 188         if (context->stdio_as_fds)
 189                 return NULL;
 190
 191         if (context->tty_path)
 192                 return context->tty_path;
 193
 194         return "/dev/console";
 195 }
 196
 197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 198         unsigned rows, cols;
 199         const char *tty;
 200
 201         assert(context);
 202         assert(ret_rows);
 203         assert(ret_cols);
 204
 205         rows = context->tty_rows;
 206         cols = context->tty_cols;
 207
 208         tty = exec_context_tty_path(context);
 209         if (tty)
 210                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 211
 212         *ret_rows = rows;
 213         *ret_cols = cols;
 214
 215         return 0;
 216 }
 217
 218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 219         _cleanup_close_ int fd = -EBADF;
 220         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 221
 222         /* Take a lock around the device for the duration of the setup that we do here.
 223          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 224          * We open a new fd that will be closed automatically, and operate on it for convenience.
 225          */
 226
 227         if (p && p->stdin_fd >= 0) {
 228                 fd = xopenat_lock(p->stdin_fd, NULL,
 229                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 230                 if (fd < 0)
 231                         return;
 232         } else if (path) {
 233                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 234                 if (fd < 0)
 235                         return;
 236
 237                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 238                         return;
 239         } else
 240                 return;   /* nothing to do */
 241
 242         if (context->tty_vhangup)
 243                 (void) terminal_vhangup_fd(fd);
 244
 245         if (context->tty_reset)
 246                 (void) reset_terminal_fd(fd, true);
 247
 248         if (p && p->stdin_fd >= 0) {
 249                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 250
 251                 (void) exec_context_tty_size(context, &rows, &cols);
 252                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 253         }
 254
 255         if (context->tty_vt_disallocate && path)
 256                 (void) vt_disallocate(path);
 257 }
 258
 259 static bool is_terminal_input(ExecInput i) {
 260         return IN_SET(i,
 261                       EXEC_INPUT_TTY,
 262                       EXEC_INPUT_TTY_FORCE,
 263                       EXEC_INPUT_TTY_FAIL);
 264 }
 265
 266 static bool is_terminal_output(ExecOutput o) {
 267         return IN_SET(o,
 268                       EXEC_OUTPUT_TTY,
 269                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 270                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 271 }
 272
 273 static bool is_kmsg_output(ExecOutput o) {
 274         return IN_SET(o,
 275                       EXEC_OUTPUT_KMSG,
 276                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 277 }
 278
 279 static bool exec_context_needs_term(const ExecContext *c) {
 280         assert(c);
 281
 282         /* Return true if the execution context suggests we should set $TERM to something useful. */
 283
 284         if (is_terminal_input(c->std_input))
 285                 return true;
 286
 287         if (is_terminal_output(c->std_output))
 288                 return true;
 289
 290         if (is_terminal_output(c->std_error))
 291                 return true;
 292
 293         return !!c->tty_path;
 294 }
 295
 296 static int open_null_as(int flags, int nfd) {
 297         int fd;
 298
 299         assert(nfd >= 0);
 300
 301         fd = open("/dev/null", flags|O_NOCTTY);
 302         if (fd < 0)
 303                 return -errno;
 304
 305         return move_fd(fd, nfd, false);
 306 }
 307
 308 static int connect_journal_socket(
 309                 int fd,
 310                 const char *log_namespace,
 311                 uid_t uid,
 312                 gid_t gid) {
 313
 314         uid_t olduid = UID_INVALID;
 315         gid_t oldgid = GID_INVALID;
 316         const char *j;
 317         int r;
 318
 319         j = log_namespace ?
 320                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 321                 "/run/systemd/journal/stdout";
 322
 323         if (gid_is_valid(gid)) {
 324                 oldgid = getgid();
 325
 326                 if (setegid(gid) < 0)
 327                         return -errno;
 328         }
 329
 330         if (uid_is_valid(uid)) {
 331                 olduid = getuid();
 332
 333                 if (seteuid(uid) < 0) {
 334                         r = -errno;
 335                         goto restore_gid;
 336                 }
 337         }
 338
 339         r = connect_unix_path(fd, AT_FDCWD, j);
 340
 341         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 342            an LSM interferes. */
 343
 344         if (uid_is_valid(uid))
 345                 (void) seteuid(olduid);
 346
 347  restore_gid:
 348         if (gid_is_valid(gid))
 349                 (void) setegid(oldgid);
 350
 351         return r;
 352 }
 353
 354 static int connect_logger_as(
 355                 const Unit *unit,
 356                 const ExecContext *context,
 357                 const ExecParameters *params,
 358                 ExecOutput output,
 359                 const char *ident,
 360                 int nfd,
 361                 uid_t uid,
 362                 gid_t gid) {
 363
 364         _cleanup_close_ int fd = -EBADF;
 365         int r;
 366
 367         assert(context);
 368         assert(params);
 369         assert(output < _EXEC_OUTPUT_MAX);
 370         assert(ident);
 371         assert(nfd >= 0);
 372
 373         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 374         if (fd < 0)
 375                 return -errno;
 376
 377         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 378         if (r < 0)
 379                 return r;
 380
 381         if (shutdown(fd, SHUT_RD) < 0)
 382                 return -errno;
 383
 384         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 385
 386         if (dprintf(fd,
 387                 "%s\n"
 388                 "%s\n"
 389                 "%i\n"
 390                 "%i\n"
 391                 "%i\n"
 392                 "%i\n"
 393                 "%i\n",
 394                 context->syslog_identifier ?: ident,
 395                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 396                 context->syslog_priority,
 397                 !!context->syslog_level_prefix,
 398                 false,
 399                 is_kmsg_output(output),
 400                 is_terminal_output(output)) < 0)
 401                 return -errno;
 402
 403         return move_fd(TAKE_FD(fd), nfd, false);
 404 }
 405
 406 static int open_terminal_as(const char *path, int flags, int nfd) {
 407         int fd;
 408
 409         assert(path);
 410         assert(nfd >= 0);
 411
 412         fd = open_terminal(path, flags | O_NOCTTY);
 413         if (fd < 0)
 414                 return fd;
 415
 416         return move_fd(fd, nfd, false);
 417 }
 418
 419 static int acquire_path(const char *path, int flags, mode_t mode) {
 420         _cleanup_close_ int fd = -EBADF;
 421         int r;
 422
 423         assert(path);
 424
 425         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 426                 flags |= O_CREAT;
 427
 428         fd = open(path, flags|O_NOCTTY, mode);
 429         if (fd >= 0)
 430                 return TAKE_FD(fd);
 431
 432         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 433                 return -errno;
 434
 435         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 436
 437         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 438         if (fd < 0)
 439                 return -errno;
 440
 441         r = connect_unix_path(fd, AT_FDCWD, path);
 442         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 443                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 444                  * wasn't an AF_UNIX socket after all */
 445                 return -ENXIO;
 446         if (r < 0)
 447                 return r;
 448
 449         if ((flags & O_ACCMODE) == O_RDONLY)
 450                 r = shutdown(fd, SHUT_WR);
 451         else if ((flags & O_ACCMODE) == O_WRONLY)
 452                 r = shutdown(fd, SHUT_RD);
 453         else
 454                 r = 0;
 455         if (r < 0)
 456                 return -errno;
 457
 458         return TAKE_FD(fd);
 459 }
 460
 461 static int fixup_input(
 462                 const ExecContext *context,
 463                 int socket_fd,
 464                 bool apply_tty_stdin) {
 465
 466         ExecInput std_input;
 467
 468         assert(context);
 469
 470         std_input = context->std_input;
 471
 472         if (is_terminal_input(std_input) && !apply_tty_stdin)
 473                 return EXEC_INPUT_NULL;
 474
 475         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 476                 return EXEC_INPUT_NULL;
 477
 478         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 479                 return EXEC_INPUT_NULL;
 480
 481         return std_input;
 482 }
 483
 484 static int fixup_output(ExecOutput output, int socket_fd) {
 485
 486         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 487                 return EXEC_OUTPUT_INHERIT;
 488
 489         return output;
 490 }
 491
 492 static int setup_input(
 493                 const ExecContext *context,
 494                 const ExecParameters *params,
 495                 int socket_fd,
 496                 const int named_iofds[static 3]) {
 497
 498         ExecInput i;
 499         int r;
 500
 501         assert(context);
 502         assert(params);
 503         assert(named_iofds);
 504
 505         if (params->stdin_fd >= 0) {
 506                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 507                         return -errno;
 508
 509                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 510                 if (isatty(STDIN_FILENO)) {
 511                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 512
 513                         (void) exec_context_tty_size(context, &rows, &cols);
 514                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 515                         (void) reset_terminal_fd(STDIN_FILENO, true);
 516                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 517                 }
 518
 519                 return STDIN_FILENO;
 520         }
 521
 522         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 523
 524         switch (i) {
 525
 526         case EXEC_INPUT_NULL:
 527                 return open_null_as(O_RDONLY, STDIN_FILENO);
 528
 529         case EXEC_INPUT_TTY:
 530         case EXEC_INPUT_TTY_FORCE:
 531         case EXEC_INPUT_TTY_FAIL: {
 532                 unsigned rows, cols;
 533                 int fd;
 534
 535                 fd = acquire_terminal(exec_context_tty_path(context),
 536                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 537                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 538                                                                   ACQUIRE_TERMINAL_WAIT,
 539                                       USEC_INFINITY);
 540                 if (fd < 0)
 541                         return fd;
 542
 543                 r = exec_context_tty_size(context, &rows, &cols);
 544                 if (r < 0)
 545                         return r;
 546
 547                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 548                 if (r < 0)
 549                         return r;
 550
 551                 return move_fd(fd, STDIN_FILENO, false);
 552         }
 553
 554         case EXEC_INPUT_SOCKET:
 555                 assert(socket_fd >= 0);
 556
 557                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 558
 559         case EXEC_INPUT_NAMED_FD:
 560                 assert(named_iofds[STDIN_FILENO] >= 0);
 561
 562                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 563                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 564
 565         case EXEC_INPUT_DATA: {
 566                 int fd;
 567
 568                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 569                 if (fd < 0)
 570                         return fd;
 571
 572                 return move_fd(fd, STDIN_FILENO, false);
 573         }
 574
 575         case EXEC_INPUT_FILE: {
 576                 bool rw;
 577                 int fd;
 578
 579                 assert(context->stdio_file[STDIN_FILENO]);
 580
 581                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 582                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 583
 584                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 585                 if (fd < 0)
 586                         return fd;
 587
 588                 return move_fd(fd, STDIN_FILENO, false);
 589         }
 590
 591         default:
 592                 assert_not_reached();
 593         }
 594 }
 595
 596 static bool can_inherit_stderr_from_stdout(
 597                 const ExecContext *context,
 598                 ExecOutput o,
 599                 ExecOutput e) {
 600
 601         assert(context);
 602
 603         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 604          * stderr fd */
 605
 606         if (e == EXEC_OUTPUT_INHERIT)
 607                 return true;
 608         if (e != o)
 609                 return false;
 610
 611         if (e == EXEC_OUTPUT_NAMED_FD)
 612                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 613
 614         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 615                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 616
 617         return true;
 618 }
 619
 620 static int setup_output(
 621                 const Unit *unit,
 622                 const ExecContext *context,
 623                 const ExecParameters *params,
 624                 int fileno,
 625                 int socket_fd,
 626                 const int named_iofds[static 3],
 627                 const char *ident,
 628                 uid_t uid,
 629                 gid_t gid,
 630                 dev_t *journal_stream_dev,
 631                 ino_t *journal_stream_ino) {
 632
 633         ExecOutput o;
 634         ExecInput i;
 635         int r;
 636
 637         assert(unit);
 638         assert(context);
 639         assert(params);
 640         assert(ident);
 641         assert(journal_stream_dev);
 642         assert(journal_stream_ino);
 643
 644         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 645
 646                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 647                         return -errno;
 648
 649                 return STDOUT_FILENO;
 650         }
 651
 652         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 653                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 654                         return -errno;
 655
 656                 return STDERR_FILENO;
 657         }
 658
 659         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 660         o = fixup_output(context->std_output, socket_fd);
 661
 662         if (fileno == STDERR_FILENO) {
 663                 ExecOutput e;
 664                 e = fixup_output(context->std_error, socket_fd);
 665
 666                 /* This expects the input and output are already set up */
 667
 668                 /* Don't change the stderr file descriptor if we inherit all
 669                  * the way and are not on a tty */
 670                 if (e == EXEC_OUTPUT_INHERIT &&
 671                     o == EXEC_OUTPUT_INHERIT &&
 672                     i == EXEC_INPUT_NULL &&
 673                     !is_terminal_input(context->std_input) &&
 674                     getppid() != 1)
 675                         return fileno;
 676
 677                 /* Duplicate from stdout if possible */
 678                 if (can_inherit_stderr_from_stdout(context, o, e))
 679                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 680
 681                 o = e;
 682
 683         } else if (o == EXEC_OUTPUT_INHERIT) {
 684                 /* If input got downgraded, inherit the original value */
 685                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 686                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 687
 688                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 689                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 690                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 691
 692                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 693                 if (getppid() != 1)
 694                         return fileno;
 695
 696                 /* We need to open /dev/null here anew, to get the right access mode. */
 697                 return open_null_as(O_WRONLY, fileno);
 698         }
 699
 700         switch (o) {
 701
 702         case EXEC_OUTPUT_NULL:
 703                 return open_null_as(O_WRONLY, fileno);
 704
 705         case EXEC_OUTPUT_TTY:
 706                 if (is_terminal_input(i))
 707                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 708
 709                 /* We don't reset the terminal if this is just about output */
 710                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 711
 712         case EXEC_OUTPUT_KMSG:
 713         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 714         case EXEC_OUTPUT_JOURNAL:
 715         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 716                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 717                 if (r < 0) {
 718                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 719                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 720                         r = open_null_as(O_WRONLY, fileno);
 721                 } else {
 722                         struct stat st;
 723
 724                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 725                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 726                          * services to detect whether they are connected to the journal or not.
 727                          *
 728                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 729                          * about STDERR as that's usually the best way to do logging. */
 730
 731                         if (fstat(fileno, &st) >= 0 &&
 732                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 733                                 *journal_stream_dev = st.st_dev;
 734                                 *journal_stream_ino = st.st_ino;
 735                         }
 736                 }
 737                 return r;
 738
 739         case EXEC_OUTPUT_SOCKET:
 740                 assert(socket_fd >= 0);
 741
 742                 return RET_NERRNO(dup2(socket_fd, fileno));
 743
 744         case EXEC_OUTPUT_NAMED_FD:
 745                 assert(named_iofds[fileno] >= 0);
 746
 747                 (void) fd_nonblock(named_iofds[fileno], false);
 748                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 749
 750         case EXEC_OUTPUT_FILE:
 751         case EXEC_OUTPUT_FILE_APPEND:
 752         case EXEC_OUTPUT_FILE_TRUNCATE: {
 753                 bool rw;
 754                 int fd, flags;
 755
 756                 assert(context->stdio_file[fileno]);
 757
 758                 rw = context->std_input == EXEC_INPUT_FILE &&
 759                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 760
 761                 if (rw)
 762                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 763
 764                 flags = O_WRONLY;
 765                 if (o == EXEC_OUTPUT_FILE_APPEND)
 766                         flags |= O_APPEND;
 767                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 768                         flags |= O_TRUNC;
 769
 770                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 771                 if (fd < 0)
 772                         return fd;
 773
 774                 return move_fd(fd, fileno, 0);
 775         }
 776
 777         default:
 778                 assert_not_reached();
 779         }
 780 }
 781
 782 static int chown_terminal(int fd, uid_t uid) {
 783         int r;
 784
 785         assert(fd >= 0);
 786
 787         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 788         if (isatty(fd) < 1) {
 789                 if (IN_SET(errno, EINVAL, ENOTTY))
 790                         return 0; /* not a tty */
 791
 792                 return -errno;
 793         }
 794
 795         /* This might fail. What matters are the results. */
 796         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 797         if (r < 0)
 798                 return r;
 799
 800         return 1;
 801 }
 802
 803 static int setup_confirm_stdio(
 804                 const ExecContext *context,
 805                 const char *vc,
 806                 int *ret_saved_stdin,
 807                 int *ret_saved_stdout) {
 808
 809         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 810         unsigned rows, cols;
 811         int r;
 812
 813         assert(ret_saved_stdin);
 814         assert(ret_saved_stdout);
 815
 816         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 817         if (saved_stdin < 0)
 818                 return -errno;
 819
 820         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 821         if (saved_stdout < 0)
 822                 return -errno;
 823
 824         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 825         if (fd < 0)
 826                 return fd;
 827
 828         r = chown_terminal(fd, getuid());
 829         if (r < 0)
 830                 return r;
 831
 832         r = reset_terminal_fd(fd, true);
 833         if (r < 0)
 834                 return r;
 835
 836         r = exec_context_tty_size(context, &rows, &cols);
 837         if (r < 0)
 838                 return r;
 839
 840         r = terminal_set_size_fd(fd, vc, rows, cols);
 841         if (r < 0)
 842                 return r;
 843
 844         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 845         TAKE_FD(fd);
 846         if (r < 0)
 847                 return r;
 848
 849         *ret_saved_stdin = TAKE_FD(saved_stdin);
 850         *ret_saved_stdout = TAKE_FD(saved_stdout);
 851         return 0;
 852 }
 853
 854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 855         assert(err < 0);
 856
 857         if (err == -ETIMEDOUT)
 858                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 859         else {
 860                 errno = -err;
 861                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 862         }
 863 }
 864
 865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 866         _cleanup_close_ int fd = -EBADF;
 867
 868         assert(vc);
 869
 870         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 871         if (fd < 0)
 872                 return;
 873
 874         write_confirm_error_fd(err, fd, u);
 875 }
 876
 877 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 878         int r = 0;
 879
 880         assert(saved_stdin);
 881         assert(saved_stdout);
 882
 883         release_terminal();
 884
 885         if (*saved_stdin >= 0)
 886                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 887                         r = -errno;
 888
 889         if (*saved_stdout >= 0)
 890                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 891                         r = -errno;
 892
 893         *saved_stdin = safe_close(*saved_stdin);
 894         *saved_stdout = safe_close(*saved_stdout);
 895
 896         return r;
 897 }
 898
 899 enum {
 900         CONFIRM_PRETEND_FAILURE = -1,
 901         CONFIRM_PRETEND_SUCCESS =  0,
 902         CONFIRM_EXECUTE = 1,
 903 };
 904
 905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 906         int saved_stdout = -1, saved_stdin = -1, r;
 907         _cleanup_free_ char *e = NULL;
 908         char c;
 909
 910         /* For any internal errors, assume a positive response. */
 911         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 912         if (r < 0) {
 913                 write_confirm_error(r, vc, u);
 914                 return CONFIRM_EXECUTE;
 915         }
 916
 917         /* confirm_spawn might have been disabled while we were sleeping. */
 918         if (manager_is_confirm_spawn_disabled(u->manager)) {
 919                 r = 1;
 920                 goto restore_stdio;
 921         }
 922
 923         e = ellipsize(cmdline, 60, 100);
 924         if (!e) {
 925                 log_oom();
 926                 r = CONFIRM_EXECUTE;
 927                 goto restore_stdio;
 928         }
 929
 930         for (;;) {
 931                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 932                 if (r < 0) {
 933                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 934                         r = CONFIRM_EXECUTE;
 935                         goto restore_stdio;
 936                 }
 937
 938                 switch (c) {
 939                 case 'c':
 940                         printf("Resuming normal execution.\n");
 941                         manager_disable_confirm_spawn();
 942                         r = 1;
 943                         break;
 944                 case 'D':
 945                         unit_dump(u, stdout, "  ");
 946                         continue; /* ask again */
 947                 case 'f':
 948                         printf("Failing execution.\n");
 949                         r = CONFIRM_PRETEND_FAILURE;
 950                         break;
 951                 case 'h':
 952                         printf("  c - continue, proceed without asking anymore\n"
 953                                "  D - dump, show the state of the unit\n"
 954                                "  f - fail, don't execute the command and pretend it failed\n"
 955                                "  h - help\n"
 956                                "  i - info, show a short summary of the unit\n"
 957                                "  j - jobs, show jobs that are in progress\n"
 958                                "  s - skip, don't execute the command and pretend it succeeded\n"
 959                                "  y - yes, execute the command\n");
 960                         continue; /* ask again */
 961                 case 'i':
 962                         printf("  Description: %s\n"
 963                                "  Unit:        %s\n"
 964                                "  Command:     %s\n",
 965                                u->id, u->description, cmdline);
 966                         continue; /* ask again */
 967                 case 'j':
 968                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 969                         continue; /* ask again */
 970                 case 'n':
 971                         /* 'n' was removed in favor of 'f'. */
 972                         printf("Didn't understand 'n', did you mean 'f'?\n");
 973                         continue; /* ask again */
 974                 case 's':
 975                         printf("Skipping execution.\n");
 976                         r = CONFIRM_PRETEND_SUCCESS;
 977                         break;
 978                 case 'y':
 979                         r = CONFIRM_EXECUTE;
 980                         break;
 981                 default:
 982                         assert_not_reached();
 983                 }
 984                 break;
 985         }
 986
 987 restore_stdio:
 988         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 989         return r;
 990 }
 991
 992 static int get_fixed_user(
 993                 const char *username,
 994                 const char **ret_user,
 995                 uid_t *ret_uid,
 996                 gid_t *ret_gid,
 997                 const char **ret_home,
 998                 const char **ret_shell) {
 999
1000         int r;
1001
1002         assert(username);
1003         assert(ret_user);
1004
1005         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1006          * (i.e. are "/" or "/bin/nologin"). */
1007
1008         r = get_user_creds(&username, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
1009         if (r < 0)
1010                 return r;
1011
1012         *ret_user = username;
1013         return 0;
1014 }
1015
/* Looks up 'groupname' via get_group_creds() (no flags). The GID is written through 'ret_gid' by the
 * lookup; '*ret_group' receives the group name as the lookup left it. Returns 0 on success, or the
 * negative error of the lookup, in which case '*ret_group' is left untouched. */
static int get_fixed_group(
                const char *groupname,
                const char **ret_group,
                gid_t *ret_gid) {

        assert(groupname);
        assert(ret_group);

        const char *resolved = groupname;
        int r = get_group_creds(&resolved, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        *ret_group = resolved;
        return 0;
}
1033
1034 static int get_supplementary_groups(const ExecContext *c, const char *user,
1035                                     const char *group, gid_t gid,
1036                                     gid_t **supplementary_gids, int *ngids) {
1037         int r, k = 0;
1038         int ngroups_max;
1039         bool keep_groups = false;
1040         gid_t *groups = NULL;
1041         _cleanup_free_ gid_t *l_gids = NULL;
1042
1043         assert(c);
1044
1045         /*
1046          * If user is given, then lookup GID and supplementary groups list.
1047          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1048          * here and as early as possible so we keep the list of supplementary
1049          * groups of the caller.
1050          */
1051         if (user && gid_is_valid(gid) && gid != 0) {
1052                 /* First step, initialize groups from /etc/groups */
1053                 if (initgroups(user, gid) < 0)
1054                         return -errno;
1055
1056                 keep_groups = true;
1057         }
1058
1059         if (strv_isempty(c->supplementary_groups))
1060                 return 0;
1061
1062         /*
1063          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1064          * be positive, otherwise fail.
1065          */
1066         errno = 0;
1067         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1068         if (ngroups_max <= 0)
1069                 return errno_or_else(EOPNOTSUPP);
1070
1071         l_gids = new(gid_t, ngroups_max);
1072         if (!l_gids)
1073                 return -ENOMEM;
1074
1075         if (keep_groups) {
1076                 /*
1077                  * Lookup the list of groups that the user belongs to, we
1078                  * avoid NSS lookups here too for gid=0.
1079                  */
1080                 k = ngroups_max;
1081                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1082                         return -EINVAL;
1083         } else
1084                 k = 0;
1085
1086         STRV_FOREACH(i, c->supplementary_groups) {
1087                 const char *g;
1088
1089                 if (k >= ngroups_max)
1090                         return -E2BIG;
1091
1092                 g = *i;
1093                 r = get_group_creds(&g, l_gids+k, 0);
1094                 if (r < 0)
1095                         return r;
1096
1097                 k++;
1098         }
1099
1100         /*
1101          * Sets ngids to zero to drop all supplementary groups, happens
1102          * when we are under root and SupplementaryGroups= is empty.
1103          */
1104         if (k == 0) {
1105                 *ngids = 0;
1106                 return 0;
1107         }
1108
1109         /* Otherwise get the final list of supplementary groups */
1110         groups = memdup(l_gids, sizeof(gid_t) * k);
1111         if (!groups)
1112                 return -ENOMEM;
1113
1114         *supplementary_gids = groups;
1115         *ngids = k;
1116
1117         groups = NULL;
1118
1119         return 0;
1120 }
1121
/* Applies group credentials to the calling process: first the supplementary groups (only if
 * 'ngids' is positive), then real, effective and saved GID (only if 'gid' is valid). Returns 0 on
 * success or a negative errno-style value from the step that failed. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1140
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set. Returns 1 if the securebits were changed, 0 if they already had the requested
 * value, and a negative errno if reading or writing them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        unsigned wanted = ((unsigned) before & ~mask) | bits;

        if (wanted != (unsigned) before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        /* Nothing to change, skip the second prctl(). */
        return 0;
}
1159
1160 static int enforce_user(
1161                 const ExecContext *context,
1162                 uid_t uid,
1163                 uint64_t capability_ambient_set) {
1164         assert(context);
1165         int r;
1166
1167         if (!uid_is_valid(uid))
1168                 return 0;
1169
1170         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1171          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1172          * case. */
1173
1174         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1175
1176                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1177                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1178                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1179                 if (r < 0)
1180                         return r;
1181         }
1182
1183         /* Second step: actually set the uids */
1184         if (setresuid(uid, uid, uid) < 0)
1185                 return -errno;
1186
1187         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1188          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1189          * outside of this call. */
1190         return 0;
1191 }
1192
1193 #if HAVE_PAM
1194
1195 static int null_conv(
1196                 int num_msg,
1197                 const struct pam_message **msg,
1198                 struct pam_response **resp,
1199                 void *appdata_ptr) {
1200
1201         /* We don't support conversations */
1202
1203         return PAM_CONV_ERR;
1204 }
1205
1206 #endif
1207
/* Opens a PAM session for service 'name' and user 'user', and forks off an "(sd-pam)" helper
 * process that tears the session down again once the calling process is gone.
 *
 * 'tty' is handed to PAM as PAM_TTY; if it is NULL the TTY behind stdin is used when there is one.
 * The entries of '*env' are exported into the PAM environment, and on success '*env' is replaced by
 * the environment list PAM hands back. 'fds'/'n_fds' are descriptors the helper closes for itself;
 * 'uid'/'gid' are the credentials the helper tries to drop to.
 *
 * Returns the result of strv_free_and_replace() on success, and a negative errno-style value on
 * failure (-EPERM for PAM-level errors). Built without HAVE_PAM this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through its return value (a positive
                                 * error number); it neither returns -1 nor sets errno. On failure
                                 * 'sig' is not written, so it must not be inspected then. */
                                int k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* A PAM error code takes precedence over whatever is in 'r'. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1424
1425 static void rename_process_from_path(const char *path) {
1426         _cleanup_free_ char *buf = NULL;
1427         const char *p;
1428
1429         assert(path);
1430
1431         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1432          * /bin/ps */
1433
1434         if (path_extract_filename(path, &buf) < 0) {
1435                 rename_process("(...)");
1436                 return;
1437         }
1438
1439         size_t l = strlen(buf);
1440         if (l > 8) {
1441                 /* The end of the process name is usually more interesting, since the first bit might just be
1442                  * "systemd-" */
1443                 p = buf + l - 8;
1444                 l = 8;
1445         } else
1446                 p = buf;
1447
1448         char process_name[11];
1449         process_name[0] = '(';
1450         memcpy(process_name+1, p, l);
1451         process_name[1+l] = ')';
1452         process_name[1+l+1] = 0;
1453
1454         rename_process(process_name);
1455 }
1456
1457 static bool context_has_address_families(const ExecContext *c) {
1458         assert(c);
1459
1460         return c->address_families_allow_list ||
1461                 !set_isempty(c->address_families);
1462 }
1463
1464 static bool context_has_syscall_filters(const ExecContext *c) {
1465         assert(c);
1466
1467         return c->syscall_allow_list ||
1468                 !hashmap_isempty(c->syscall_filter);
1469 }
1470
1471 static bool context_has_syscall_logs(const ExecContext *c) {
1472         assert(c);
1473
1474         return c->syscall_log_allow_list ||
1475                 !hashmap_isempty(c->syscall_log);
1476 }
1477
1478 static bool context_has_no_new_privileges(const ExecContext *c) {
1479         assert(c);
1480
1481         if (c->no_new_privileges)
1482                 return true;
1483
1484         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1485                 return false;
1486
1487         /* We need NNP if we have any form of seccomp and are unprivileged */
1488         return c->lock_personality ||
1489                 c->memory_deny_write_execute ||
1490                 c->private_devices ||
1491                 c->protect_clock ||
1492                 c->protect_hostname ||
1493                 c->protect_kernel_tunables ||
1494                 c->protect_kernel_modules ||
1495                 c->protect_kernel_logs ||
1496                 context_has_address_families(c) ||
1497                 exec_context_restrict_namespaces_set(c) ||
1498                 c->restrict_realtime ||
1499                 c->restrict_suid_sgid ||
1500                 !set_isempty(c->syscall_archs) ||
1501                 context_has_syscall_filters(c) ||
1502                 context_has_syscall_logs(c);
1503 }
1504
1505 #if HAVE_SECCOMP
1506
1507 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1508
1509         if (is_seccomp_available())
1510                 return false;
1511
1512         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1513         return true;
1514 }
1515
1516 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1517         uint32_t negative_action, default_action, action;
1518         int r;
1519
1520         assert(u);
1521         assert(c);
1522
1523         if (!context_has_syscall_filters(c))
1524                 return 0;
1525
1526         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1527                 return 0;
1528
1529         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1530
1531         if (c->syscall_allow_list) {
1532                 default_action = negative_action;
1533                 action = SCMP_ACT_ALLOW;
1534         } else {
1535                 default_action = SCMP_ACT_ALLOW;
1536                 action = negative_action;
1537         }
1538
1539         if (needs_ambient_hack) {
1540                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1541                 if (r < 0)
1542                         return r;
1543         }
1544
1545         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1546 }
1547
1548 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1549 #ifdef SCMP_ACT_LOG
1550         uint32_t default_action, action;
1551 #endif
1552
1553         assert(u);
1554         assert(c);
1555
1556         if (!context_has_syscall_logs(c))
1557                 return 0;
1558
1559 #ifdef SCMP_ACT_LOG
1560         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1561                 return 0;
1562
1563         if (c->syscall_log_allow_list) {
1564                 /* Log nothing but the ones listed */
1565                 default_action = SCMP_ACT_ALLOW;
1566                 action = SCMP_ACT_LOG;
1567         } else {
1568                 /* Log everything but the ones listed */
1569                 default_action = SCMP_ACT_LOG;
1570                 action = SCMP_ACT_ALLOW;
1571         }
1572
1573         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1574 #else
1575         /* old libseccomp */
1576         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1577         return 0;
1578 #endif
1579 }
1580
1581 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1582         assert(u);
1583         assert(c);
1584
1585         if (set_isempty(c->syscall_archs))
1586                 return 0;
1587
1588         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1589                 return 0;
1590
1591         return seccomp_restrict_archs(c->syscall_archs);
1592 }
1593
1594 static int apply_address_families(const Unit* u, const ExecContext *c) {
1595         assert(u);
1596         assert(c);
1597
1598         if (!context_has_address_families(c))
1599                 return 0;
1600
1601         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1602                 return 0;
1603
1604         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1605 }
1606
1607 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1608         int r;
1609
1610         assert(u);
1611         assert(c);
1612
1613         if (!c->memory_deny_write_execute)
1614                 return 0;
1615
1616         /* use prctl() if kernel supports it (6.3) */
1617         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1618         if (r == 0) {
1619                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1620                 return 0;
1621         }
1622         if (r < 0 && errno != EINVAL)
1623                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1624         /* else use seccomp */
1625         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1626
1627         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1628                 return 0;
1629
1630         return seccomp_memory_deny_write_execute();
1631 }
1632
1633 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1634         assert(u);
1635         assert(c);
1636
1637         if (!c->restrict_realtime)
1638                 return 0;
1639
1640         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1641                 return 0;
1642
1643         return seccomp_restrict_realtime();
1644 }
1645
1646 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1647         assert(u);
1648         assert(c);
1649
1650         if (!c->restrict_suid_sgid)
1651                 return 0;
1652
1653         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1654                 return 0;
1655
1656         return seccomp_restrict_suid_sgid();
1657 }
1658
1659 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1660         assert(u);
1661         assert(c);
1662
1663         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1664          * let's protect even those systems where this is left on in the kernel. */
1665
1666         if (!c->protect_kernel_tunables)
1667                 return 0;
1668
1669         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1670                 return 0;
1671
1672         return seccomp_protect_sysctl();
1673 }
1674
1675 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1676         assert(u);
1677         assert(c);
1678
1679         /* Turn off module syscalls on ProtectKernelModules=yes */
1680
1681         if (!c->protect_kernel_modules)
1682                 return 0;
1683
1684         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1685                 return 0;
1686
1687         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1688 }
1689
1690 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1691         assert(u);
1692         assert(c);
1693
1694         if (!c->protect_kernel_logs)
1695                 return 0;
1696
1697         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1698                 return 0;
1699
1700         return seccomp_protect_syslog();
1701 }
1702
1703 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1704         assert(u);
1705         assert(c);
1706
1707         if (!c->protect_clock)
1708                 return 0;
1709
1710         if (skip_seccomp_unavailable(u, "ProtectClock="))
1711                 return 0;
1712
1713         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1714 }
1715
1716 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1717         assert(u);
1718         assert(c);
1719
1720         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1721
1722         if (!c->private_devices)
1723                 return 0;
1724
1725         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1726                 return 0;
1727
1728         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1729 }
1730
1731 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1732         assert(u);
1733         assert(c);
1734
1735         if (!exec_context_restrict_namespaces_set(c))
1736                 return 0;
1737
1738         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1739                 return 0;
1740
1741         return seccomp_restrict_namespaces(c->restrict_namespaces);
1742 }
1743
1744 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1745         unsigned long personality;
1746         int r;
1747
1748         assert(u);
1749         assert(c);
1750
1751         if (!c->lock_personality)
1752                 return 0;
1753
1754         if (skip_seccomp_unavailable(u, "LockPersonality="))
1755                 return 0;
1756
1757         personality = c->personality;
1758
1759         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1760         if (personality == PERSONALITY_INVALID) {
1761
1762                 r = opinionated_personality(&personality);
1763                 if (r < 0)
1764                         return r;
1765         }
1766
1767         return seccomp_lock_personality(personality);
1768 }
1769
1770 #endif
1771
1772 #if HAVE_LIBBPF
1773 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1774         assert(u);
1775         assert(c);
1776
1777         if (!exec_context_restrict_filesystems_set(c))
1778                 return 0;
1779
1780         if (!u->manager->restrict_fs) {
1781                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1782                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1783                 return 0;
1784         }
1785
1786         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1787 }
1788 #endif
1789
1790 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1791         assert(u);
1792         assert(c);
1793
1794         if (!c->protect_hostname)
1795                 return 0;
1796
1797         if (ns_type_supported(NAMESPACE_UTS)) {
1798                 if (unshare(CLONE_NEWUTS) < 0) {
1799                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1800                                 *ret_exit_status = EXIT_NAMESPACE;
1801                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1802                         }
1803
1804                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1805                 }
1806         } else
1807                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1808
1809 #if HAVE_SECCOMP
1810         int r;
1811
1812         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1813                 return 0;
1814
1815         r = seccomp_protect_hostname();
1816         if (r < 0) {
1817                 *ret_exit_status = EXIT_SECCOMP;
1818                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1819         }
1820 #endif
1821
1822         return 0;
1823 }
1824
1825 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1826         assert(idle_pipe);
1827
1828         idle_pipe[1] = safe_close(idle_pipe[1]);
1829         idle_pipe[2] = safe_close(idle_pipe[2]);
1830
1831         if (idle_pipe[0] >= 0) {
1832                 int r;
1833
1834                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1835
1836                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1837                         ssize_t n;
1838
1839                         /* Signal systemd that we are bored and want to continue. */
1840                         n = write(idle_pipe[3], "x", 1);
1841                         if (n > 0)
1842                                 /* Wait for systemd to react to the signal above. */
1843                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1844                 }
1845
1846                 idle_pipe[0] = safe_close(idle_pipe[0]);
1847
1848         }
1849
1850         idle_pipe[3] = safe_close(idle_pipe[3]);
1851 }
1852
/* Forward declaration: build_environment() below uses this to obtain the environment variable name
 * for an exec directory type (and skips the type if NULL is returned). The definition is not part
 * of this section of the file. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1854
/* Assembles the environment variables that are generated here for a process about to be executed,
 * based on the unit 'u', its execution context 'c', the execution parameters 'p' and the remaining
 * arguments.
 *
 * Depending on the inputs, the following variables may be emitted: $LISTEN_PID, $LISTEN_FDS,
 * $LISTEN_FDNAMES, $WATCHDOG_PID, $WATCHDOG_USEC, $SYSTEMD_NSS_DYNAMIC_BYPASS, $USER, $LOGNAME, $HOME,
 * $SHELL, $INVOCATION_ID, $TERM, $JOURNAL_STREAM, $LOG_NAMESPACE, one variable per exec directory type
 * that has directories configured, $CREDENTIALS_DIRECTORY, $SYSTEMD_EXEC_PID, $MEMORY_PRESSURE_WATCH
 * and $MEMORY_PRESSURE_WRITE.
 *
 * Parameters:
 *   cgroup_context        may be NULL; only consulted for $MEMORY_PRESSURE_WRITE.
 *   n_fds, fdnames        number and names of passed file descriptors; used only if n_fds > 0.
 *   home, username, shell may be NULL; replaced by root's values if neither User= nor DynamicUser= is
 *                         set and the runtime scope is the system one.
 *   journal_stream_dev,
 *   journal_stream_ino    $JOURNAL_STREAM is only emitted if both are non-zero.
 *   memory_pressure_path  may be NULL, in which case no memory pressure variables are emitted.
 *   ret                   on success receives the newly allocated string array (ownership is handed
 *                         over via TAKE_PTR()); not written on failure.
 *
 * Returns 0 on success, a negative errno-style value on failure (-ENOMEM for allocation failures, or
 * whatever get_fixed_user()/exec_context_get_credential_directory() returned). */
static int build_environment(
                const Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                char **fdnames,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        /* Capacity budget for the array: at most 18 fixed variables are added below, plus one slot that
         * must stay unused (the assert at the end enforces this), plus at most one variable per exec
         * directory type. Whoever adds a variable below needs to bump N_ENV_VARS accordingly. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* File descriptor passing: announce our PID, the number of fds, and their colon-separated names. */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(fdnames, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog variables are only set if requested via the flag and a non-zero timeout is configured. */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
         * SetLoginEnvironment= switch. */
        if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to determine user credentials for root: %m");
        }

        /* An explicit (non-negative) SetLoginEnvironment= value wins; otherwise the login variables are
         * set exactly when User= or DynamicUser= is in effect. */
        bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);

        if (username) {
                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (set_user_login_env) {
                        x = strjoin("LOGNAME=", username);
                        if (!x)
                                return -ENOMEM;
                        our_env[n_env++] = x;
                }
        }

        if (home && set_user_login_env) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the path, i.e. skip over the 5 characters of the "HOME=" prefix. */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (shell && set_user_login_env) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Likewise, skip over the 6 characters of the "SHELL=" prefix. */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* For other TTYs with a purely alphanumerical name, look for a
                         * "systemd.tty.term.<tty>" key on the kernel command line. Failure to read it
                         * is not fatal. */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Nothing found so far (this includes $TERM being unset in our own environment): fall
                 * back to the default for this TTY. */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* One variable per exec directory type: a colon-separated list of all configured directories of
         * that type, each prefixed with the type's prefix path. Types without a prefix, without any
         * configured directory, or without an environment variable name are skipped. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* $CREDENTIALS_DIRECTORY is only set if the lookup reports a directory (r > 0). */
        _cleanup_free_ char *creds_dir = NULL;
        r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
        if (r < 0)
                return r;
        if (r > 0) {
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* This one is set unconditionally. */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* The write string is only generated if we have a cgroup context and the watch path is
                 * not /dev/null. */
                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        /* Format is "<type> <threshold> <window>". An unset (infinite) threshold maps to
                         * the default; any other value is clamped into the range 1…default window. */
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* Note the +1: the trailing NUL byte is part of the data that gets encoded. */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly less: at least one array slot must remain unused after the last variable. */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2111
2112 static int build_pass_environment(const ExecContext *c, char ***ret) {
2113         _cleanup_strv_free_ char **pass_env = NULL;
2114         size_t n_env = 0;
2115
2116         STRV_FOREACH(i, c->pass_environment) {
2117                 _cleanup_free_ char *x = NULL;
2118                 char *v;
2119
2120                 v = getenv(*i);
2121                 if (!v)
2122                         continue;
2123                 x = strjoin(*i, "=", v);
2124                 if (!x)
2125                         return -ENOMEM;
2126
2127                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2128                         return -ENOMEM;
2129
2130                 pass_env[n_env++] = TAKE_PTR(x);
2131                 pass_env[n_env] = NULL;
2132         }
2133
2134         *ret = TAKE_PTR(pass_env);
2135
2136         return 0;
2137 }
2138
2139 bool exec_needs_network_namespace(const ExecContext *context) {
2140         assert(context);
2141
2142         return context->private_network || context->network_namespace_path;
2143 }
2144
2145 static bool exec_needs_ephemeral(const ExecContext *context) {
2146         return (context->root_image || context->root_directory) && context->root_ephemeral;
2147 }
2148
2149 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2150         assert(context);
2151
2152         return context->private_ipc || context->ipc_namespace_path;
2153 }
2154
2155 bool exec_needs_mount_namespace(
2156                 const ExecContext *context,
2157                 const ExecParameters *params,
2158                 const ExecRuntime *runtime) {
2159
2160         assert(context);
2161
2162         if (context->root_image)
2163                 return true;
2164
2165         if (!strv_isempty(context->read_write_paths) ||
2166             !strv_isempty(context->read_only_paths) ||
2167             !strv_isempty(context->inaccessible_paths) ||
2168             !strv_isempty(context->exec_paths) ||
2169             !strv_isempty(context->no_exec_paths))
2170                 return true;
2171
2172         if (context->n_bind_mounts > 0)
2173                 return true;
2174
2175         if (context->n_temporary_filesystems > 0)
2176                 return true;
2177
2178         if (context->n_mount_images > 0)
2179                 return true;
2180
2181         if (context->n_extension_images > 0)
2182                 return true;
2183
2184         if (!strv_isempty(context->extension_directories))
2185                 return true;
2186
2187         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2188                 return true;
2189
2190         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2191                 return true;
2192
2193         if (context->private_devices ||
2194             context->private_mounts > 0 ||
2195             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2196             context->protect_system != PROTECT_SYSTEM_NO ||
2197             context->protect_home != PROTECT_HOME_NO ||
2198             context->protect_kernel_tunables ||
2199             context->protect_kernel_modules ||
2200             context->protect_kernel_logs ||
2201             context->protect_control_groups ||
2202             context->protect_proc != PROTECT_PROC_DEFAULT ||
2203             context->proc_subset != PROC_SUBSET_ALL ||
2204             exec_needs_ipc_namespace(context))
2205                 return true;
2206
2207         if (context->root_directory) {
2208                 if (exec_context_get_effective_mount_apivfs(context))
2209                         return true;
2210
2211                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2212                         if (params && !params->prefix[t])
2213                                 continue;
2214
2215                         if (context->directories[t].n_items > 0)
2216                                 return true;
2217                 }
2218         }
2219
2220         if (context->dynamic_user &&
2221             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2222              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2223              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2224                 return true;
2225
2226         if (context->log_namespace)
2227                 return true;
2228
2229         return false;
2230 }
2231
/* Moves the calling process into a new user namespace and installs UID/GID maps for it.
 *
 * Parameters:
 *   ouid, ogid  the original UID/GID, i.e. the IDs from before any user or group change; always mapped
 *               to themselves.
 *   uid, gid    the selected UID/GID; additionally mapped to themselves if valid, different from the
 *               original ID, and we have the matching capability (CAP_SETUID resp. CAP_SETGID).
 *
 * Returns 0 on success and a negative errno-style value on failure. If the helper child reports an
 * error, that error is returned; if the child misbehaves in some other way, -EIO is returned. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        /* NOTE(review): presumably the cleanup handler kills and reaps the helper child if we return
         * early while 'pid' is still set — confirm against sigkill_waitp(). */
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value transferred through the eventfd to wake up the child */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe, so that the parent sees
                 * EOF on it. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Best effort: pass the negative error code to the parent, then exit unsuccessfully. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that reading from the pipe below returns EOF once
         * the child has exited without reporting an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* a non-negative "error" makes no sense, treat it as a generic failure */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* NOTE(review): TAKE_PID() presumably resets 'pid' so that the cleanup handler above leaves the
         * child alone once we wait for it ourselves — confirm. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2390
2391 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2392         assert(context);
2393
2394         if (!context->dynamic_user)
2395                 return false;
2396
2397         if (type == EXEC_DIRECTORY_CONFIGURATION)
2398                 return false;
2399
2400         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2401                 return false;
2402
2403         return true;
2404 }
2405
2406 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2407         _cleanup_free_ char *src_abs = NULL;
2408         int r;
2409
2410         assert(source);
2411
2412         src_abs = path_join(root, source);
2413         if (!src_abs)
2414                 return -ENOMEM;
2415
2416         STRV_FOREACH(dst, symlinks) {
2417                 _cleanup_free_ char *dst_abs = NULL;
2418
2419                 dst_abs = path_join(root, *dst);
2420                 if (!dst_abs)
2421                         return -ENOMEM;
2422
2423                 r = mkdir_parents_label(dst_abs, 0755);
2424                 if (r < 0)
2425                         return r;
2426
2427                 r = symlink_idempotent(src_abs, dst_abs, true);
2428                 if (r < 0)
2429                         return r;
2430         }
2431
2432         return 0;
2433 }
2434
2435 static int setup_exec_directory(
2436                 Unit *u,
2437                 const ExecContext *context,
2438                 const ExecParameters *params,
2439                 uid_t uid,
2440                 gid_t gid,
2441                 ExecDirectoryType type,
2442                 bool needs_mount_namespace,
2443                 int *exit_status) {
2444
2445         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2446                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2447                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2448                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2449                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2450                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2451         };
2452         int r;
2453
2454         assert(context);
2455         assert(params);
2456         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2457         assert(exit_status);
2458
2459         if (!params->prefix[type])
2460                 return 0;
2461
2462         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2463                 if (!uid_is_valid(uid))
2464                         uid = 0;
2465                 if (!gid_is_valid(gid))
2466                         gid = 0;
2467         }
2468
2469         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2470                 _cleanup_free_ char *p = NULL, *pp = NULL;
2471
2472                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2473                 if (!p) {
2474                         r = -ENOMEM;
2475                         goto fail;
2476                 }
2477
2478                 r = mkdir_parents_label(p, 0755);
2479                 if (r < 0)
2480                         goto fail;
2481
2482                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2483
2484                         /* If we are in user mode, and a configuration directory exists but a state directory
2485                          * doesn't exist, then we likely are upgrading from an older systemd version that
2486                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2487                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2488                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2489                          * separated. If a service has both dirs configured but only the configuration dir
2490                          * exists and the state dir does not, we assume we are looking at an update
2491                          * situation. Hence, create a compatibility symlink, so that all expectations are
2492                          * met.
2493                          *
2494                          * (We also do something similar with the log directory, which still doesn't exist in
2495                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2496
2497                         /* this assumes the state dir is always created before the configuration dir */
2498                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2499                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2500
2501                         r = laccess(p, F_OK);
2502                         if (r == -ENOENT) {
2503                                 _cleanup_free_ char *q = NULL;
2504
2505                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2506                                  * under the configuration hierarchy. */
2507
2508                                 if (type == EXEC_DIRECTORY_STATE)
2509                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2510                                 else if (type == EXEC_DIRECTORY_LOGS)
2511                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2512                                 else
2513                                         assert_not_reached();
2514                                 if (!q) {
2515                                         r = -ENOMEM;
2516                                         goto fail;
2517                                 }
2518
2519                                 r = laccess(q, F_OK);
2520                                 if (r >= 0) {
2521                                         /* It does exist! This hence looks like an update. Symlink the
2522                                          * configuration directory into the state directory. */
2523
2524                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2525                                         if (r < 0)
2526                                                 goto fail;
2527
2528                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2529                                         continue;
2530                                 } else if (r != -ENOENT)
2531                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2532
2533                         } else if (r < 0)
2534                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2535                 }
2536
2537                 if (exec_directory_is_private(context, type)) {
2538                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2539                          * case we want to avoid leaving a directory around fully accessible that is owned by
2540                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2541                          * trick used by container managers to prohibit host users to get access to files of
2542                          * the same UID in containers: we place everything inside a directory that has an
2543                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2544                          * for unprivileged host code. We then use fs namespacing to make this directory
2545                          * permeable for the service itself.
2546                          *
2547                          * Specifically: for a service which wants a special directory "foo/" we first create
2548                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2549                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2550                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2551                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2552                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2553                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2554                          * for the service and making sure it only gets access to the dirs it needs but no
2555                          * others. Tricky? Yes, absolutely, but it works!
2556                          *
2557                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2558                          * to be owned by the service itself.
2559                          *
2560                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2561                          * for sharing files or sockets with other services. */
2562
2563                         pp = path_join(params->prefix[type], "private");
2564                         if (!pp) {
2565                                 r = -ENOMEM;
2566                                 goto fail;
2567                         }
2568
2569                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2570                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2571                         if (r < 0)
2572                                 goto fail;
2573
2574                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2575                                 r = -ENOMEM;
2576                                 goto fail;
2577                         }
2578
2579                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2580                         r = mkdir_parents_label(pp, 0755);
2581                         if (r < 0)
2582                                 goto fail;
2583
2584                         if (is_dir(p, false) > 0 &&
2585                             (laccess(pp, F_OK) == -ENOENT)) {
2586
2587                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2588                                  * it over. Most likely the service has been upgraded from one that didn't use
2589                                  * DynamicUser=1, to one that does. */
2590
2591                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2592                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2593                                               exec_directory_type_to_string(type), p, pp);
2594
2595                                 r = RET_NERRNO(rename(p, pp));
2596                                 if (r < 0)
2597                                         goto fail;
2598                         } else {
2599                                 /* Otherwise, create the actual directory for the service */
2600
2601                                 r = mkdir_label(pp, context->directories[type].mode);
2602                                 if (r < 0 && r != -EEXIST)
2603                                         goto fail;
2604                         }
2605
2606                         if (!context->directories[type].items[i].only_create) {
2607                                 /* And link it up from the original place.
2608                                  * Notes
2609                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2610                                  *    the host, and a new one for the child namespace will be created later.
2611                                  * 2) It is not necessary to create this symlink when one of its parent
2612                                  *    directories is specified and already created. E.g.
2613                                  *        StateDirectory=foo foo/bar
2614                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2615                                  *        pp = "/var/lib/private/foo/bar"
2616                                  *        p = "/var/lib/foo/bar"
2617                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2618                                  *    we do not need to create the symlink, but we cannot create the symlink.
2619                                  *    See issue #24783. */
2620                                 r = symlink_idempotent(pp, p, true);
2621                                 if (r < 0)
2622                                         goto fail;
2623                         }
2624
2625                 } else {
2626                         _cleanup_free_ char *target = NULL;
2627
2628                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2629                             readlink_and_make_absolute(p, &target) >= 0) {
2630                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2631
2632                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2633                                  * by DynamicUser=1 (see above)?
2634                                  *
2635                                  * We do this for all directory types except for ConfigurationDirectory=,
2636                                  * since they all support the private/ symlink logic at least in some
2637                                  * configurations, see above. */
2638
2639                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2640                                 if (r < 0)
2641                                         goto fail;
2642
2643                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2644                                 if (!q) {
2645                                         r = -ENOMEM;
2646                                         goto fail;
2647                                 }
2648
2649                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2650                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2651                                 if (r < 0)
2652                                         goto fail;
2653
2654                                 if (path_equal(q_resolved, target_resolved)) {
2655
2656                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2657                                          * but is no longer. Let's move the directory back up. */
2658
2659                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2660                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2661                                                       exec_directory_type_to_string(type), q, p);
2662
2663                                         r = RET_NERRNO(unlink(p));
2664                                         if (r < 0)
2665                                                 goto fail;
2666
2667                                         r = RET_NERRNO(rename(q, p));
2668                                         if (r < 0)
2669                                                 goto fail;
2670                                 }
2671                         }
2672
2673                         r = mkdir_label(p, context->directories[type].mode);
2674                         if (r < 0) {
2675                                 if (r != -EEXIST)
2676                                         goto fail;
2677
2678                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2679                                         struct stat st;
2680
2681                                         /* Don't change the owner/access mode of the configuration directory,
2682                                          * as in the common case it is not written to by a service, and shall
2683                                          * not be writable. */
2684
2685                                         r = RET_NERRNO(stat(p, &st));
2686                                         if (r < 0)
2687                                                 goto fail;
2688
2689                                         /* Still complain if the access mode doesn't match */
2690                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2691                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2692                                                                  "(File system: %o %sMode: %o)",
2693                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2694                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2695
2696                                         continue;
2697                                 }
2698                         }
2699                 }
2700
2701                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2702                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2703                  * current UID/GID ownership.) */
2704                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2705                 if (r < 0)
2706                         goto fail;
2707
2708                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2709                  * available to user code anyway */
2710                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2711                         continue;
2712
2713                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2714                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2715                  * assignments to exist. */
2716                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2717                 if (r < 0)
2718                         goto fail;
2719         }
2720
2721         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2722          * they are set up later, to allow configuring empty var/run/etc. */
2723         if (!needs_mount_namespace)
2724                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2725                         r = create_many_symlinks(params->prefix[type],
2726                                                  context->directories[type].items[i].path,
2727                                                  context->directories[type].items[i].symlinks);
2728                         if (r < 0)
2729                                 goto fail;
2730                 }
2731
2732         return 0;
2733
2734 fail:
2735         *exit_status = exit_status_table[type];
2736         return r;
2737 }
2738
2739 #if ENABLE_SMACK
2740 static int setup_smack(
2741                 const Manager *manager,
2742                 const ExecContext *context,
2743                 int executable_fd) {
2744         int r;
2745
2746         assert(context);
2747         assert(executable_fd >= 0);
2748
2749         if (context->smack_process_label) {
2750                 r = mac_smack_apply_pid(0, context->smack_process_label);
2751                 if (r < 0)
2752                         return r;
2753         } else if (manager->defaults.smack_process_label) {
2754                 _cleanup_free_ char *exec_label = NULL;
2755
2756                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2757                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2758                         return r;
2759
2760                 r = mac_smack_apply_pid(0, exec_label ?: manager->defaults.smack_process_label);
2761                 if (r < 0)
2762                         return r;
2763         }
2764
2765         return 0;
2766 }
2767 #endif
2768
2769 static int compile_bind_mounts(
2770                 const ExecContext *context,
2771                 const ExecParameters *params,
2772                 BindMount **ret_bind_mounts,
2773                 size_t *ret_n_bind_mounts,
2774                 char ***ret_empty_directories) {
2775
2776         _cleanup_strv_free_ char **empty_directories = NULL;
2777         BindMount *bind_mounts = NULL;
2778         size_t n, h = 0;
2779         int r;
2780
2781         assert(context);
2782         assert(params);
2783         assert(ret_bind_mounts);
2784         assert(ret_n_bind_mounts);
2785         assert(ret_empty_directories);
2786
2787         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2788
2789         n = context->n_bind_mounts;
2790         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2791                 if (!params->prefix[t])
2792                         continue;
2793
2794                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2795                         n += !context->directories[t].items[i].only_create;
2796         }
2797
2798         if (n <= 0) {
2799                 *ret_bind_mounts = NULL;
2800                 *ret_n_bind_mounts = 0;
2801                 *ret_empty_directories = NULL;
2802                 return 0;
2803         }
2804
2805         bind_mounts = new(BindMount, n);
2806         if (!bind_mounts)
2807                 return -ENOMEM;
2808
2809         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2810                 BindMount *item = context->bind_mounts + i;
2811                 _cleanup_free_ char *s = NULL, *d = NULL;
2812
2813                 s = strdup(item->source);
2814                 if (!s)
2815                         return -ENOMEM;
2816
2817                 d = strdup(item->destination);
2818                 if (!d)
2819                         return -ENOMEM;
2820
2821                 bind_mounts[h++] = (BindMount) {
2822                         .source = TAKE_PTR(s),
2823                         .destination = TAKE_PTR(d),
2824                         .read_only = item->read_only,
2825                         .recursive = item->recursive,
2826                         .ignore_enoent = item->ignore_enoent,
2827                 };
2828         }
2829
2830         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2831                 if (!params->prefix[t])
2832                         continue;
2833
2834                 if (context->directories[t].n_items == 0)
2835                         continue;
2836
2837                 if (exec_directory_is_private(context, t) &&
2838                     !exec_context_with_rootfs(context)) {
2839                         char *private_root;
2840
2841                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2842                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2843                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2844
2845                         private_root = path_join(params->prefix[t], "private");
2846                         if (!private_root)
2847                                 return -ENOMEM;
2848
2849                         r = strv_consume(&empty_directories, private_root);
2850                         if (r < 0)
2851                                 return r;
2852                 }
2853
2854                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2855                         _cleanup_free_ char *s = NULL, *d = NULL;
2856
2857                         /* When one of the parent directories is in the list, we cannot create the symlink
2858                          * for the child directory. See also the comments in setup_exec_directory(). */
2859                         if (context->directories[t].items[i].only_create)
2860                                 continue;
2861
2862                         if (exec_directory_is_private(context, t))
2863                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2864                         else
2865                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2866                         if (!s)
2867                                 return -ENOMEM;
2868
2869                         if (exec_directory_is_private(context, t) &&
2870                             exec_context_with_rootfs(context))
2871                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2872                                  * directory is not created on the root directory. So, let's bind-mount the directory
2873                                  * on the 'non-private' place. */
2874                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2875                         else
2876                                 d = strdup(s);
2877                         if (!d)
2878                                 return -ENOMEM;
2879
2880                         bind_mounts[h++] = (BindMount) {
2881                                 .source = TAKE_PTR(s),
2882                                 .destination = TAKE_PTR(d),
2883                                 .read_only = false,
2884                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2885                                 .recursive = true,
2886                                 .ignore_enoent = false,
2887                         };
2888                 }
2889         }
2890
2891         assert(h == n);
2892
2893         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2894         *ret_n_bind_mounts = n;
2895         *ret_empty_directories = TAKE_PTR(empty_directories);
2896
2897         return (int) n;
2898 }
2899
2900 /* ret_symlinks will contain a list of pairs src:dest that describes
2901  * the symlinks to create later on. For example, the symlinks needed
2902  * to safely give private directories to DynamicUser=1 users. */
2903 static int compile_symlinks(
2904                 const ExecContext *context,
2905                 const ExecParameters *params,
2906                 bool setup_os_release_symlink,
2907                 char ***ret_symlinks) {
2908
2909         _cleanup_strv_free_ char **symlinks = NULL;
2910         int r;
2911
2912         assert(context);
2913         assert(params);
2914         assert(ret_symlinks);
2915
2916         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2917                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2918                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2919
2920                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2921                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2922
2923                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924                                 dst_abs = path_join(params->prefix[dt], *symlink);
2925                                 if (!src_abs || !dst_abs)
2926                                         return -ENOMEM;
2927
2928                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2929                                 if (r < 0)
2930                                         return r;
2931                         }
2932
2933                         if (!exec_directory_is_private(context, dt) ||
2934                             exec_context_with_rootfs(context) ||
2935                             context->directories[dt].items[i].only_create)
2936                                 continue;
2937
2938                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2939                         if (!private_path)
2940                                 return -ENOMEM;
2941
2942                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2943                         if (!path)
2944                                 return -ENOMEM;
2945
2946                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2947                         if (r < 0)
2948                                 return r;
2949                 }
2950         }
2951
2952         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2953          * and readers will never get a half-written version. Note that, while the paths specified here are
2954          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2955          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2956         if (setup_os_release_symlink) {
2957                 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2958                 if (r < 0)
2959                         return r;
2960
2961                 r = strv_extend(&symlinks, "/run/host/os-release");
2962                 if (r < 0)
2963                         return r;
2964         }
2965
2966         *ret_symlinks = TAKE_PTR(symlinks);
2967
2968         return 0;
2969 }
2970
2971 static bool insist_on_sandboxing(
2972                 const ExecContext *context,
2973                 const char *root_dir,
2974                 const char *root_image,
2975                 const BindMount *bind_mounts,
2976                 size_t n_bind_mounts) {
2977
2978         assert(context);
2979         assert(n_bind_mounts == 0 || bind_mounts);
2980
2981         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2982          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2983          * rearrange stuff in a way we cannot ignore gracefully. */
2984
2985         if (context->n_temporary_filesystems > 0)
2986                 return true;
2987
2988         if (root_dir || root_image)
2989                 return true;
2990
2991         if (context->n_mount_images > 0)
2992                 return true;
2993
2994         if (context->dynamic_user)
2995                 return true;
2996
2997         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2998                 return true;
2999
3000         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3001          * essential. */
3002         for (size_t i = 0; i < n_bind_mounts; i++)
3003                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3004                         return true;
3005
3006         if (context->log_namespace)
3007                 return true;
3008
3009         return false;
3010 }
3011
3012 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3013         _cleanup_close_ int fd = -EBADF;
3014         int r;
3015
3016         if (!runtime || !runtime->ephemeral_copy)
3017                 return 0;
3018
3019         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3020         if (r < 0)
3021                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3022
3023         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3024
3025         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3026         if (fd >= 0)
3027                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3028                 return 0;
3029
3030         if (fd != -EAGAIN)
3031                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3032
3033         log_debug("Making ephemeral snapshot of %s to %s",
3034                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3035
3036         if (context->root_image)
3037                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3038                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3039         else
3040                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3041                                               AT_FDCWD, runtime->ephemeral_copy,
3042                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3043                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3044                                               BTRFS_SNAPSHOT_RECURSIVE |
3045                                               BTRFS_SNAPSHOT_LOCK_BSD);
3046         if (fd < 0)
3047                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3048                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3049
3050         if (context->root_image) {
3051                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3052                  * which tends to not perform well in combination with lots of random writes.
3053                  *
3054                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3055                  * copy, but we at least want to make the intention clear.
3056                  */
3057                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3058                 if (r < 0)
3059                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3060         }
3061
3062         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3063         if (r < 0)
3064                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3065
3066         return 1;
3067 }
3068
3069 static int verity_settings_prepare(
3070                 VeritySettings *verity,
3071                 const char *root_image,
3072                 const void *root_hash,
3073                 size_t root_hash_size,
3074                 const char *root_hash_path,
3075                 const void *root_hash_sig,
3076                 size_t root_hash_sig_size,
3077                 const char *root_hash_sig_path,
3078                 const char *verity_data_path) {
3079
3080         int r;
3081
3082         assert(verity);
3083
3084         if (root_hash) {
3085                 void *d;
3086
3087                 d = memdup(root_hash, root_hash_size);
3088                 if (!d)
3089                         return -ENOMEM;
3090
3091                 free_and_replace(verity->root_hash, d);
3092                 verity->root_hash_size = root_hash_size;
3093                 verity->designator = PARTITION_ROOT;
3094         }
3095
3096         if (root_hash_sig) {
3097                 void *d;
3098
3099                 d = memdup(root_hash_sig, root_hash_sig_size);
3100                 if (!d)
3101                         return -ENOMEM;
3102
3103                 free_and_replace(verity->root_hash_sig, d);
3104                 verity->root_hash_sig_size = root_hash_sig_size;
3105                 verity->designator = PARTITION_ROOT;
3106         }
3107
3108         if (verity_data_path) {
3109                 r = free_and_strdup(&verity->data_path, verity_data_path);
3110                 if (r < 0)
3111                         return r;
3112         }
3113
3114         r = verity_settings_load(
3115                         verity,
3116                         root_image,
3117                         root_hash_path,
3118                         root_hash_sig_path);
3119         if (r < 0)
3120                 return log_debug_errno(r, "Failed to load root hash: %m");
3121
3122         return 0;
3123 }
3124
3125 static int apply_mount_namespace(
3126                 const Unit *u,
3127                 ExecCommandFlags command_flags,
3128                 const ExecContext *context,
3129                 const ExecParameters *params,
3130                 ExecRuntime *runtime,
3131                 const char *memory_pressure_path,
3132                 char **error_path) {
3133
3134         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3135         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3136                         **read_write_paths_cleanup = NULL;
3137         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3138                         *extension_dir = NULL, *host_os_release_stage = NULL;
3139         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3140         char **read_write_paths;
3141         bool needs_sandboxing, setup_os_release_symlink;
3142         BindMount *bind_mounts = NULL;
3143         size_t n_bind_mounts = 0;
3144         int r;
3145
3146         assert(context);
3147
3148         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3149
3150         if (params->flags & EXEC_APPLY_CHROOT) {
3151                 r = setup_ephemeral(context, runtime);
3152                 if (r < 0)
3153                         return r;
3154
3155                 if (context->root_image)
3156                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3157                 else
3158                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3159         }
3160
3161         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3162         if (r < 0)
3163                 return r;
3164
3165         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3166          * service will need to write to it in order to start the notifications. */
3167         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3168                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3169                 if (!read_write_paths_cleanup)
3170                         return -ENOMEM;
3171
3172                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3173                 if (r < 0)
3174                         return r;
3175
3176                 read_write_paths = read_write_paths_cleanup;
3177         } else
3178                 read_write_paths = context->read_write_paths;
3179
3180         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3181         if (needs_sandboxing) {
3182                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3183                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3184                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3185
3186                 if (context->private_tmp && runtime && runtime->shared) {
3187                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3188                                 tmp_dir = runtime->shared->tmp_dir;
3189                         else if (runtime->shared->tmp_dir)
3190                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3191
3192                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3193                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3194                         else if (runtime->shared->var_tmp_dir)
3195                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3196                 }
3197         }
3198
3199         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3200         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3201         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3202         if (r < 0)
3203                 return r;
3204
3205         if (context->mount_propagation_flag == MS_SHARED)
3206                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
3208         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3209                 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3210                 if (r < 0)
3211                         return r;
3212         }
3213
3214         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3215                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3216                 if (!propagate_dir)
3217                         return -ENOMEM;
3218
3219                 incoming_dir = strdup("/run/systemd/incoming");
3220                 if (!incoming_dir)
3221                         return -ENOMEM;
3222
3223                 extension_dir = strdup("/run/systemd/unit-extensions");
3224                 if (!extension_dir)
3225                         return -ENOMEM;
3226
3227                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3228                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3229                 if (setup_os_release_symlink) {
3230                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3231                         if (!host_os_release_stage)
3232                                 return -ENOMEM;
3233                 }
3234         } else {
3235                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3236
3237                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3238                         return -ENOMEM;
3239
3240                 if (setup_os_release_symlink) {
3241                         if (asprintf(&host_os_release_stage,
3242                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3243                                      geteuid()) < 0)
3244                                 return -ENOMEM;
3245                 }
3246         }
3247
3248         if (root_image) {
3249                 r = verity_settings_prepare(
3250                         &verity,
3251                         root_image,
3252                         context->root_hash, context->root_hash_size, context->root_hash_path,
3253                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3254                         context->root_verity);
3255                 if (r < 0)
3256                         return r;
3257         }
3258
3259         NamespaceParameters parameters = {
3260                 .runtime_scope = params->runtime_scope,
3261
3262                 .root_directory = root_dir,
3263                 .root_image = root_image,
3264                 .root_image_options = context->root_image_options,
3265                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3266
3267                 .read_write_paths = read_write_paths,
3268                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3269                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3270
3271                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3272                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3273
3274                 .empty_directories = empty_directories,
3275                 .symlinks = symlinks,
3276
3277                 .bind_mounts = bind_mounts,
3278                 .n_bind_mounts = n_bind_mounts,
3279
3280                 .temporary_filesystems = context->temporary_filesystems,
3281                 .n_temporary_filesystems = context->n_temporary_filesystems,
3282
3283                 .mount_images = context->mount_images,
3284                 .n_mount_images = context->n_mount_images,
3285                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3286
3287                 .tmp_dir = tmp_dir,
3288                 .var_tmp_dir = var_tmp_dir,
3289
3290                 .creds_path = creds_path,
3291                 .log_namespace = context->log_namespace,
3292                 .mount_propagation_flag = context->mount_propagation_flag,
3293
3294                 .verity = &verity,
3295
3296                 .extension_images = context->extension_images,
3297                 .n_extension_images = context->n_extension_images,
3298                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3299                 .extension_directories = context->extension_directories,
3300
3301                 .propagate_dir = propagate_dir,
3302                 .incoming_dir = incoming_dir,
3303                 .extension_dir = extension_dir,
3304                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3305                 .host_os_release_stage = host_os_release_stage,
3306
3307                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3308                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3309                  * sandbox inside the mount namespace. */
3310                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3311
3312                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3313                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3314                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3315                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3316                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3317
3318                 .private_dev = needs_sandboxing && context->private_devices,
3319                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3320                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3321
3322                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3323
3324                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3325                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3326
3327                 .protect_home = needs_sandboxing && context->protect_home,
3328                 .protect_system = needs_sandboxing && context->protect_system,
3329                 .protect_proc = needs_sandboxing && context->protect_proc,
3330                 .proc_subset = needs_sandboxing && context->proc_subset,
3331         };
3332
3333         r = setup_namespace(&parameters, error_path);
3334         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3335          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3336          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3337          * completely different execution environment. */
3338         if (r == -ENOANO) {
3339                 if (insist_on_sandboxing(
3340                                     context,
3341                                     root_dir, root_image,
3342                                     bind_mounts,
3343                                     n_bind_mounts))
3344                         return log_unit_debug_errno(u,
3345                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3346                                                     "Failed to set up namespace, and refusing to continue since "
3347                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3348                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3349                                                     n_bind_mounts,
3350                                                     context->n_temporary_filesystems,
3351                                                     yes_no(root_dir),
3352                                                     yes_no(root_image),
3353                                                     yes_no(context->dynamic_user));
3354
3355                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3356                 return 0;
3357         }
3358
3359         return r;
3360 }
3361
3362 static int apply_working_directory(
3363                 const ExecContext *context,
3364                 const ExecParameters *params,
3365                 ExecRuntime *runtime,
3366                 const char *home,
3367                 int *exit_status) {
3368
3369         const char *d, *wd;
3370
3371         assert(context);
3372         assert(exit_status);
3373
3374         if (context->working_directory_home) {
3375
3376                 if (!home) {
3377                         *exit_status = EXIT_CHDIR;
3378                         return -ENXIO;
3379                 }
3380
3381                 wd = home;
3382
3383         } else
3384                 wd = empty_to_root(context->working_directory);
3385
3386         if (params->flags & EXEC_APPLY_CHROOT)
3387                 d = wd;
3388         else
3389                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3390
3391         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3392                 *exit_status = EXIT_CHDIR;
3393                 return -errno;
3394         }
3395
3396         return 0;
3397 }
3398
3399 static int apply_root_directory(
3400                 const ExecContext *context,
3401                 const ExecParameters *params,
3402                 ExecRuntime *runtime,
3403                 const bool needs_mount_ns,
3404                 int *exit_status) {
3405
3406         assert(context);
3407         assert(exit_status);
3408
3409         if (params->flags & EXEC_APPLY_CHROOT)
3410                 if (!needs_mount_ns && context->root_directory)
3411                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3412                                 *exit_status = EXIT_CHROOT;
3413                                 return -errno;
3414                         }
3415
3416         return 0;
3417 }
3418
3419 static int setup_keyring(
3420                 const Unit *u,
3421                 const ExecContext *context,
3422                 const ExecParameters *p,
3423                 uid_t uid, gid_t gid) {
3424
3425         key_serial_t keyring;
3426         int r = 0;
3427         uid_t saved_uid;
3428         gid_t saved_gid;
3429
3430         assert(u);
3431         assert(context);
3432         assert(p);
3433
3434         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3435          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3436          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3437          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3438          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3439          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3440
3441         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3442                 return 0;
3443
3444         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3445          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3446          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3447          * & group is just as nasty as acquiring a reference to the user keyring. */
3448
3449         saved_uid = getuid();
3450         saved_gid = getgid();
3451
3452         if (gid_is_valid(gid) && gid != saved_gid) {
3453                 if (setregid(gid, -1) < 0)
3454                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3455         }
3456
3457         if (uid_is_valid(uid) && uid != saved_uid) {
3458                 if (setreuid(uid, -1) < 0) {
3459                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3460                         goto out;
3461                 }
3462         }
3463
3464         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3465         if (keyring == -1) {
3466                 if (errno == ENOSYS)
3467                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3468                 else if (ERRNO_IS_PRIVILEGE(errno))
3469                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3470                 else if (errno == EDQUOT)
3471                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3472                 else
3473                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3474
3475                 goto out;
3476         }
3477
3478         /* When requested link the user keyring into the session keyring. */
3479         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3480
3481                 if (keyctl(KEYCTL_LINK,
3482                            KEY_SPEC_USER_KEYRING,
3483                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3484                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3485                         goto out;
3486                 }
3487         }
3488
3489         /* Restore uid/gid back */
3490         if (uid_is_valid(uid) && uid != saved_uid) {
3491                 if (setreuid(saved_uid, -1) < 0) {
3492                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3493                         goto out;
3494                 }
3495         }
3496
3497         if (gid_is_valid(gid) && gid != saved_gid) {
3498                 if (setregid(saved_gid, -1) < 0)
3499                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3500         }
3501
3502         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3503         if (!sd_id128_is_null(u->invocation_id)) {
3504                 key_serial_t key;
3505
3506                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3507                 if (key == -1)
3508                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3509                 else {
3510                         if (keyctl(KEYCTL_SETPERM, key,
3511                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3512                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3513                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3514                 }
3515         }
3516
3517 out:
3518         /* Revert back uid & gid for the last time, and exit */
3519         /* no extra logging, as only the first already reported error matters */
3520         if (getuid() != saved_uid)
3521                 (void) setreuid(saved_uid, -1);
3522
3523         if (getgid() != saved_gid)
3524                 (void) setregid(saved_gid, -1);
3525
3526         return r;
3527 }
3528
/* Appends each non-negative fd of 'pair' to 'array', advancing the fill counter *n for every fd stored.
 * The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3539
3540 static int close_remaining_fds(
3541                 const ExecParameters *params,
3542                 const ExecRuntime *runtime,
3543                 int user_lookup_fd,
3544                 int socket_fd,
3545                 const int *fds, size_t n_fds) {
3546
3547         size_t n_dont_close = 0;
3548         int dont_close[n_fds + 14];
3549
3550         assert(params);
3551
3552         if (params->stdin_fd >= 0)
3553                 dont_close[n_dont_close++] = params->stdin_fd;
3554         if (params->stdout_fd >= 0)
3555                 dont_close[n_dont_close++] = params->stdout_fd;
3556         if (params->stderr_fd >= 0)
3557                 dont_close[n_dont_close++] = params->stderr_fd;
3558
3559         if (socket_fd >= 0)
3560                 dont_close[n_dont_close++] = socket_fd;
3561         if (n_fds > 0) {
3562                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3563                 n_dont_close += n_fds;
3564         }
3565
3566         if (runtime)
3567                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3568
3569         if (runtime && runtime->shared) {
3570                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3571                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3572         }
3573
3574         if (runtime && runtime->dynamic_creds) {
3575                 if (runtime->dynamic_creds->user)
3576                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3577                 if (runtime->dynamic_creds->group)
3578                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3579         }
3580
3581         if (user_lookup_fd >= 0)
3582                 dont_close[n_dont_close++] = user_lookup_fd;
3583
3584         return close_all_fds(dont_close, n_dont_close);
3585 }
3586
3587 static int send_user_lookup(
3588                 Unit *unit,
3589                 int user_lookup_fd,
3590                 uid_t uid,
3591                 gid_t gid) {
3592
3593         assert(unit);
3594
3595         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3596          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3597          * specified. */
3598
3599         if (user_lookup_fd < 0)
3600                 return 0;
3601
3602         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3603                 return 0;
3604
3605         if (writev(user_lookup_fd,
3606                (struct iovec[]) {
3607                            IOVEC_MAKE(&uid, sizeof(uid)),
3608                            IOVEC_MAKE(&gid, sizeof(gid)),
3609                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3610                 return -errno;
3611
3612         return 0;
3613 }
3614
3615 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3616         int r;
3617
3618         assert(c);
3619         assert(home);
3620         assert(buf);
3621
3622         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3623
3624         if (*home)
3625                 return 0;
3626
3627         if (!c->working_directory_home)
3628                 return 0;
3629
3630         r = get_home_dir(buf);
3631         if (r < 0)
3632                 return r;
3633
3634         *home = *buf;
3635         return 1;
3636 }
3637
3638 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3639         _cleanup_strv_free_ char ** list = NULL;
3640         int r;
3641
3642         assert(c);
3643         assert(p);
3644         assert(ret);
3645
3646         assert(c->dynamic_user);
3647
3648         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3649          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3650          * directories. */
3651
3652         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3653                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3654                         continue;
3655
3656                 if (!p->prefix[t])
3657                         continue;
3658
3659                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3660                         char *e;
3661
3662                         if (exec_directory_is_private(c, t))
3663                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3664                         else
3665                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3666                         if (!e)
3667                                 return -ENOMEM;
3668
3669                         r = strv_consume(&list, e);
3670                         if (r < 0)
3671                                 return r;
3672                 }
3673         }
3674
3675         *ret = TAKE_PTR(list);
3676
3677         return 0;
3678 }
3679
3680 static int exec_parameters_get_cgroup_path(
3681                 const ExecParameters *params,
3682                 const CGroupContext *c,
3683                 char **ret) {
3684
3685         const char *subgroup = NULL;
3686         char *p;
3687
3688         assert(params);
3689         assert(ret);
3690
3691         if (!params->cgroup_path)
3692                 return -EINVAL;
3693
3694         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3695          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3696          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3697          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3698          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3699          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3700          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3701          * flag, which is only passed for the former statements, not for the latter. */
3702
3703         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3704                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3705                         subgroup = ".control";
3706                 else
3707                         subgroup = c->delegate_subgroup;
3708         }
3709
3710         if (subgroup)
3711                 p = path_join(params->cgroup_path, subgroup);
3712         else
3713                 p = strdup(params->cgroup_path);
3714         if (!p)
3715                 return -ENOMEM;
3716
3717         *ret = p;
3718         return !!subgroup;
3719 }
3720
3721 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3722         _cleanup_(cpu_set_reset) CPUSet s = {};
3723         int r;
3724
3725         assert(c);
3726         assert(ret);
3727
3728         if (!c->numa_policy.nodes.set) {
3729                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3730                 return 0;
3731         }
3732
3733         r = numa_to_cpu_set(&c->numa_policy, &s);
3734         if (r < 0)
3735                 return r;
3736
3737         cpu_set_reset(ret);
3738
3739         return cpu_set_add_all(ret, &s);
3740 }
3741
3742 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3743         assert(c);
3744
3745         return c->cpu_affinity_from_numa;
3746 }
3747
3748 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3749         int r;
3750
3751         assert(fds);
3752         assert(n_fds);
3753         assert(*n_fds < fds_size);
3754         assert(ret_fd);
3755
3756         if (fd < 0) {
3757                 *ret_fd = -EBADF;
3758                 return 0;
3759         }
3760
3761         if (fd < 3 + (int) *n_fds) {
3762                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3763                  * the fds we pass to the process (or which are closed only during execve). */
3764
3765                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3766                 if (r < 0)
3767                         return -errno;
3768
3769                 close_and_replace(fd, r);
3770         }
3771
3772         *ret_fd = fds[*n_fds] = fd;
3773         (*n_fds) ++;
3774         return 1;
3775 }
3776
3777 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3778         union sockaddr_union addr = {
3779                 .un.sun_family = AF_UNIX,
3780         };
3781         socklen_t sa_len;
3782         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3783         int r;
3784
3785         assert(u);
3786         assert(of);
3787         assert(ofd >= 0);
3788
3789         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3790         if (r < 0)
3791                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3792
3793         sa_len = r;
3794
3795         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3796                 _cleanup_close_ int fd = -EBADF;
3797
3798                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3799                 if (fd < 0)
3800                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3801
3802                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3803                 if (r == -EPROTOTYPE)
3804                         continue;
3805                 if (r < 0)
3806                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3807
3808                 return TAKE_FD(fd);
3809         }
3810
3811         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3812 }
3813
3814 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3815         struct stat st;
3816         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3817
3818         assert(u);
3819         assert(of);
3820
3821         ofd = open(of->path, O_PATH | O_CLOEXEC);
3822         if (ofd < 0)
3823                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3824
3825         if (fstat(ofd, &st) < 0)
3826                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3827
3828         if (S_ISSOCK(st.st_mode)) {
3829                 fd = connect_unix_harder(u, of, ofd);
3830                 if (fd < 0)
3831                         return fd;
3832
3833                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3834                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3835                                                     of->path);
3836
3837                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3838         } else {
3839                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3840                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3841                         flags |= O_APPEND;
3842                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3843                         flags |= O_TRUNC;
3844
3845                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3846                 if (fd < 0)
3847                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3848
3849                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3850         }
3851
3852         return TAKE_FD(fd);
3853 }
3854
3855 static int collect_open_file_fds(
3856                 Unit *u,
3857                 OpenFile* open_files,
3858                 int **fds,
3859                 char ***fdnames,
3860                 size_t *n_fds) {
3861         int r;
3862
3863         assert(u);
3864         assert(fds);
3865         assert(fdnames);
3866         assert(n_fds);
3867
3868         LIST_FOREACH(open_files, of, open_files) {
3869                 _cleanup_close_ int fd = -EBADF;
3870
3871                 fd = get_open_file_fd(u, of);
3872                 if (fd < 0) {
3873                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3874                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3875                                 continue;
3876                         }
3877
3878                         return fd;
3879                 }
3880
3881                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3882                         return -ENOMEM;
3883
3884                 r = strv_extend(fdnames, of->fdname);
3885                 if (r < 0)
3886                         return r;
3887
3888                 (*fds)[*n_fds] = TAKE_FD(fd);
3889
3890                 (*n_fds)++;
3891         }
3892
3893         return 0;
3894 }
3895
3896 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3897         assert(unit);
3898         assert(msg);
3899         assert(executable);
3900
3901         if (!DEBUG_LOGGING)
3902                 return;
3903
3904         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3905
3906         log_unit_struct(unit, LOG_DEBUG,
3907                         "EXECUTABLE=%s", executable,
3908                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3909                         LOG_UNIT_INVOCATION_ID(unit));
3910 }
3911
3912 static bool exec_context_need_unprivileged_private_users(
3913                 const ExecContext *context,
3914                 const ExecParameters *params) {
3915
3916         assert(context);
3917         assert(params);
3918
3919         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3920          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3921          * (system manager) then we have privileges and don't need this. */
3922         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3923                 return false;
3924
3925         return context->private_users ||
3926                context->private_tmp ||
3927                context->private_devices ||
3928                context->private_network ||
3929                context->network_namespace_path ||
3930                context->private_ipc ||
3931                context->ipc_namespace_path ||
3932                context->private_mounts > 0 ||
3933                context->mount_apivfs ||
3934                context->n_bind_mounts > 0 ||
3935                context->n_temporary_filesystems > 0 ||
3936                context->root_directory ||
3937                !strv_isempty(context->extension_directories) ||
3938                context->protect_system != PROTECT_SYSTEM_NO ||
3939                context->protect_home != PROTECT_HOME_NO ||
3940                context->protect_kernel_tunables ||
3941                context->protect_kernel_modules ||
3942                context->protect_kernel_logs ||
3943                context->protect_control_groups ||
3944                context->protect_clock ||
3945                context->protect_hostname ||
3946                !strv_isempty(context->read_write_paths) ||
3947                !strv_isempty(context->read_only_paths) ||
3948                !strv_isempty(context->inaccessible_paths) ||
3949                !strv_isempty(context->exec_paths) ||
3950                !strv_isempty(context->no_exec_paths);
3951 }
3952
3953 static int exec_child(
3954                 Unit *unit,
3955                 const ExecCommand *command,
3956                 const ExecContext *context,
3957                 const ExecParameters *params,
3958                 ExecRuntime *runtime,
3959                 const CGroupContext *cgroup_context,
3960                 int socket_fd,
3961                 const int named_iofds[static 3],
3962                 int *params_fds,
3963                 size_t n_socket_fds,
3964                 size_t n_storage_fds,
3965                 char **files_env,
3966                 int user_lookup_fd,
3967                 int *exit_status) {
3968
3969         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3970         int r, ngids = 0, exec_fd;
3971         _cleanup_free_ gid_t *supplementary_gids = NULL;
3972         const char *username = NULL, *groupname = NULL;
3973         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3974         const char *home = NULL, *shell = NULL;
3975         char **final_argv = NULL;
3976         dev_t journal_stream_dev = 0;
3977         ino_t journal_stream_ino = 0;
3978         bool userns_set_up = false;
3979         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3980                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3981                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3982                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3983 #if HAVE_SELINUX
3984         _cleanup_free_ char *mac_selinux_context_net = NULL;
3985         bool use_selinux = false;
3986 #endif
3987 #if ENABLE_SMACK
3988         bool use_smack = false;
3989 #endif
3990 #if HAVE_APPARMOR
3991         bool use_apparmor = false;
3992 #endif
3993         uid_t saved_uid = getuid();
3994         gid_t saved_gid = getgid();
3995         uid_t uid = UID_INVALID;
3996         gid_t gid = GID_INVALID;
3997         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3998                n_keep_fds; /* total number of fds not to close */
3999         int secure_bits;
4000         _cleanup_free_ gid_t *gids_after_pam = NULL;
4001         int ngids_after_pam = 0;
4002         _cleanup_free_ int *fds = NULL;
4003         _cleanup_strv_free_ char **fdnames = NULL;
4004
4005         assert(unit);
4006         assert(command);
4007         assert(context);
4008         assert(params);
4009         assert(exit_status);
4010
4011         /* Explicitly test for CVE-2021-4034 inspired invocations */
4012         assert(command->path);
4013         assert(!strv_isempty(command->argv));
4014
4015         rename_process_from_path(command->path);
4016
4017         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4018          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4019          * both of which will be demoted to SIG_DFL. */
4020         (void) default_signals(SIGNALS_CRASH_HANDLER,
4021                                SIGNALS_IGNORE);
4022
4023         if (context->ignore_sigpipe)
4024                 (void) ignore_signals(SIGPIPE);
4025
4026         r = reset_signal_mask();
4027         if (r < 0) {
4028                 *exit_status = EXIT_SIGNAL_MASK;
4029                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4030         }
4031
4032         if (params->idle_pipe)
4033                 do_idle_pipe_dance(params->idle_pipe);
4034
4035         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4036          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4037          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4038          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4039
4040         log_forget_fds();
4041         log_set_open_when_needed(true);
4042         log_settle_target();
4043         if (context->log_level_max >= 0)
4044                 log_set_max_level(context->log_level_max);
4045
4046         /* In case anything used libc syslog(), close this here, too */
4047         closelog();
4048
4049         fds = newdup(int, params_fds, n_fds);
4050         if (!fds) {
4051                 *exit_status = EXIT_MEMORY;
4052                 return log_oom();
4053         }
4054
4055         fdnames = strv_copy((char**) params->fd_names);
4056         if (!fdnames) {
4057                 *exit_status = EXIT_MEMORY;
4058                 return log_oom();
4059         }
4060
4061         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4062         if (r < 0) {
4063                 *exit_status = EXIT_FDS;
4064                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4065         }
4066
4067         int keep_fds[n_fds + 3];
4068         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4069         n_keep_fds = n_fds;
4070
4071         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4072         if (r < 0) {
4073                 *exit_status = EXIT_FDS;
4074                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4075         }
4076
4077 #if HAVE_LIBBPF
4078         if (unit->manager->restrict_fs) {
4079                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4080                 if (bpf_map_fd < 0) {
4081                         *exit_status = EXIT_FDS;
4082                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4083                 }
4084
4085                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4086                 if (r < 0) {
4087                         *exit_status = EXIT_FDS;
4088                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4089                 }
4090         }
4091 #endif
4092
4093         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4094         if (r < 0) {
4095                 *exit_status = EXIT_FDS;
4096                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4097         }
4098
4099         if (!context->same_pgrp &&
4100             setsid() < 0) {
4101                 *exit_status = EXIT_SETSID;
4102                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4103         }
4104
4105         exec_context_tty_reset(context, params);
4106
4107         if (unit_shall_confirm_spawn(unit)) {
4108                 _cleanup_free_ char *cmdline = NULL;
4109
4110                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4111                 if (!cmdline) {
4112                         *exit_status = EXIT_MEMORY;
4113                         return log_oom();
4114                 }
4115
4116                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4117                 if (r != CONFIRM_EXECUTE) {
4118                         if (r == CONFIRM_PRETEND_SUCCESS) {
4119                                 *exit_status = EXIT_SUCCESS;
4120                                 return 0;
4121                         }
4122
4123                         *exit_status = EXIT_CONFIRM;
4124                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4125                                                     "Execution cancelled by the user");
4126                 }
4127         }
4128
4129         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4130          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4131          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4132          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4133          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4134         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4135             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4136                 *exit_status = EXIT_MEMORY;
4137                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4138         }
4139
4140         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4141                 _cleanup_strv_free_ char **suggested_paths = NULL;
4142
4143                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4144                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4145                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4146                         *exit_status = EXIT_USER;
4147                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4148                 }
4149
4150                 r = compile_suggested_paths(context, params, &suggested_paths);
4151                 if (r < 0) {
4152                         *exit_status = EXIT_MEMORY;
4153                         return log_oom();
4154                 }
4155
4156                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4157                 if (r < 0) {
4158                         *exit_status = EXIT_USER;
4159                         if (r == -EILSEQ)
4160                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4161                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4162                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4163                 }
4164
4165                 if (!uid_is_valid(uid)) {
4166                         *exit_status = EXIT_USER;
4167                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4168                 }
4169
4170                 if (!gid_is_valid(gid)) {
4171                         *exit_status = EXIT_USER;
4172                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4173                 }
4174
4175                 if (runtime->dynamic_creds->user)
4176                         username = runtime->dynamic_creds->user->name;
4177
4178         } else {
4179                 if (context->user) {
4180                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4181                         if (r < 0) {
4182                                 *exit_status = EXIT_USER;
4183                                 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4184                         }
4185                 }
4186
4187                 if (context->group) {
4188                         r = get_fixed_group(context->group, &groupname, &gid);
4189                         if (r < 0) {
4190                                 *exit_status = EXIT_GROUP;
4191                                 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4192                         }
4193                 }
4194         }
4195
4196         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4197         r = get_supplementary_groups(context, username, groupname, gid,
4198                                      &supplementary_gids, &ngids);
4199         if (r < 0) {
4200                 *exit_status = EXIT_GROUP;
4201                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4202         }
4203
4204         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4205         if (r < 0) {
4206                 *exit_status = EXIT_USER;
4207                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4208         }
4209
4210         user_lookup_fd = safe_close(user_lookup_fd);
4211
4212         r = acquire_home(context, uid, &home, &home_buffer);
4213         if (r < 0) {
4214                 *exit_status = EXIT_CHDIR;
4215                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4216         }
4217
4218         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4219         if (socket_fd >= 0)
4220                 (void) fd_nonblock(socket_fd, false);
4221
4222         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4223          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4224         if (params->cgroup_path) {
4225                 _cleanup_free_ char *p = NULL;
4226
4227                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4228                 if (r < 0) {
4229                         *exit_status = EXIT_CGROUP;
4230                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4231                 }
4232
4233                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4234                 if (r == -EUCLEAN) {
4235                         *exit_status = EXIT_CGROUP;
4236                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4237                                                     "because the cgroup or one of its parents or "
4238                                                     "siblings is in the threaded mode: %m", p);
4239                 }
4240                 if (r < 0) {
4241                         *exit_status = EXIT_CGROUP;
4242                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4243                 }
4244         }
4245
4246         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4247                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4248                 if (r < 0) {
4249                         *exit_status = EXIT_NETWORK;
4250                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4251                 }
4252         }
4253
4254         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4255                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4256                 if (r < 0) {
4257                         *exit_status = EXIT_NAMESPACE;
4258                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4259                 }
4260         }
4261
4262         r = setup_input(context, params, socket_fd, named_iofds);
4263         if (r < 0) {
4264                 *exit_status = EXIT_STDIN;
4265                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4266         }
4267
4268         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4269         if (r < 0) {
4270                 *exit_status = EXIT_STDOUT;
4271                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4272         }
4273
4274         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4275         if (r < 0) {
4276                 *exit_status = EXIT_STDERR;
4277                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4278         }
4279
4280         if (context->oom_score_adjust_set) {
4281                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4282                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4283                 r = set_oom_score_adjust(context->oom_score_adjust);
4284                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4285                         log_unit_debug_errno(unit, r,
4286                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4287                 else if (r < 0) {
4288                         *exit_status = EXIT_OOM_ADJUST;
4289                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4290                 }
4291         }
4292
4293         if (context->coredump_filter_set) {
4294                 r = set_coredump_filter(context->coredump_filter);
4295                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4296                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4297                 else if (r < 0) {
4298                         *exit_status = EXIT_LIMITS;
4299                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4300                 }
4301         }
4302
4303         if (context->nice_set) {
4304                 r = setpriority_closest(context->nice);
4305                 if (r < 0) {
4306                         *exit_status = EXIT_NICE;
4307                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4308                 }
4309         }
4310
4311         if (context->cpu_sched_set) {
4312                 struct sched_param param = {
4313                         .sched_priority = context->cpu_sched_priority,
4314                 };
4315
4316                 r = sched_setscheduler(0,
4317                                        context->cpu_sched_policy |
4318                                        (context->cpu_sched_reset_on_fork ?
4319                                         SCHED_RESET_ON_FORK : 0),
4320                                        &param);
4321                 if (r < 0) {
4322                         *exit_status = EXIT_SETSCHEDULER;
4323                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4324                 }
4325         }
4326
4327         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4328                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4329                 const CPUSet *cpu_set;
4330
4331                 if (context->cpu_affinity_from_numa) {
4332                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4333                         if (r < 0) {
4334                                 *exit_status = EXIT_CPUAFFINITY;
4335                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4336                         }
4337
4338                         cpu_set = &converted_cpu_set;
4339                 } else
4340                         cpu_set = &context->cpu_set;
4341
4342                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4343                         *exit_status = EXIT_CPUAFFINITY;
4344                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4345                 }
4346         }
4347
4348         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4349                 r = apply_numa_policy(&context->numa_policy);
4350                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4351                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4352                 else if (r < 0) {
4353                         *exit_status = EXIT_NUMA_POLICY;
4354                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4355                 }
4356         }
4357
4358         if (context->ioprio_set)
4359                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4360                         *exit_status = EXIT_IOPRIO;
4361                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4362                 }
4363
4364         if (context->timer_slack_nsec != NSEC_INFINITY)
4365                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4366                         *exit_status = EXIT_TIMERSLACK;
4367                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4368                 }
4369
4370         if (context->personality != PERSONALITY_INVALID) {
4371                 r = safe_personality(context->personality);
4372                 if (r < 0) {
4373                         *exit_status = EXIT_PERSONALITY;
4374                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4375                 }
4376         }
4377
4378         if (context->utmp_id) {
4379                 const char *line = context->tty_path ?
4380                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4381                         NULL;
4382                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4383                                       line,
4384                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4385                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4386                                       USER_PROCESS,
4387                                       username);
4388         }
4389
4390         if (uid_is_valid(uid)) {
4391                 r = chown_terminal(STDIN_FILENO, uid);
4392                 if (r < 0) {
4393                         *exit_status = EXIT_STDIN;
4394                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4395                 }
4396         }
4397
4398         if (params->cgroup_path) {
4399                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4400                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4401                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4402                  * touch a single hierarchy too. */
4403
4404                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4405                         _cleanup_free_ char *p = NULL;
4406
4407                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4408                         if (r < 0) {
4409                                 *exit_status = EXIT_CGROUP;
4410                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4411                         }
4412
4413                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4414                         if (r < 0) {
4415                                 *exit_status = EXIT_CGROUP;
4416                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4417                         }
4418                         if (r > 0) {
4419                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4420                                 if (r < 0) {
4421                                         *exit_status = EXIT_CGROUP;
4422                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4423                                 }
4424                         }
4425                 }
4426
4427                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4428                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4429                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4430                                 if (r < 0) {
4431                                         *exit_status = EXIT_MEMORY;
4432                                         return log_oom();
4433                                 }
4434
4435                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4436                                 if (r < 0) {
4437                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4438                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4439                                         memory_pressure_path = mfree(memory_pressure_path);
4440                                 }
4441                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4442                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4443                                 if (!memory_pressure_path) {
4444                                         *exit_status = EXIT_MEMORY;
4445                                         return log_oom();
4446                                 }
4447                         }
4448                 }
4449         }
4450
4451         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4452
4453         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4454                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4455                 if (r < 0)
4456                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4457         }
4458
4459         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4460                 r = exec_setup_credentials(context, params, unit->id, uid, gid);
4461                 if (r < 0) {
4462                         *exit_status = EXIT_CREDENTIALS;
4463                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4464                 }
4465         }
4466
4467         r = build_environment(
4468                         unit,
4469                         context,
4470                         params,
4471                         cgroup_context,
4472                         n_fds,
4473                         fdnames,
4474                         home,
4475                         username,
4476                         shell,
4477                         journal_stream_dev,
4478                         journal_stream_ino,
4479                         memory_pressure_path,
4480                         &our_env);
4481         if (r < 0) {
4482                 *exit_status = EXIT_MEMORY;
4483                 return log_oom();
4484         }
4485
4486         r = build_pass_environment(context, &pass_env);
4487         if (r < 0) {
4488                 *exit_status = EXIT_MEMORY;
4489                 return log_oom();
4490         }
4491
4492         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4493          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4494          * not specify PATH but the unit has ExecSearchPath. */
4495         if (!strv_isempty(context->exec_search_path)) {
4496                 _cleanup_free_ char *joined = NULL;
4497
4498                 joined = strv_join(context->exec_search_path, ":");
4499                 if (!joined) {
4500                         *exit_status = EXIT_MEMORY;
4501                         return log_oom();
4502                 }
4503
4504                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4505                 if (r < 0) {
4506                         *exit_status = EXIT_MEMORY;
4507                         return log_oom();
4508                 }
4509         }
4510
4511         accum_env = strv_env_merge(params->environment,
4512                                    our_env,
4513                                    joined_exec_search_path,
4514                                    pass_env,
4515                                    context->environment,
4516                                    files_env);
4517         if (!accum_env) {
4518                 *exit_status = EXIT_MEMORY;
4519                 return log_oom();
4520         }
4521         accum_env = strv_env_clean(accum_env);
4522
4523         (void) umask(context->umask);
4524
4525         r = setup_keyring(unit, context, params, uid, gid);
4526         if (r < 0) {
4527                 *exit_status = EXIT_KEYRING;
4528                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4529         }
4530
4531         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4532          * from it. */
4533         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4534
4535         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4536          * for it, and the kernel doesn't actually support ambient caps. */
4537         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4538
4539         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4540          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4541          * desired. */
4542         if (needs_ambient_hack)
4543                 needs_setuid = false;
4544         else
4545                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4546
4547         uint64_t capability_ambient_set = context->capability_ambient_set;
4548
4549         if (needs_sandboxing) {
4550                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4551                  * /sys being present. The actual MAC context application will happen later, as late as
4552                  * possible, to avoid impacting our own code paths. */
4553
4554 #if HAVE_SELINUX
4555                 use_selinux = mac_selinux_use();
4556 #endif
4557 #if ENABLE_SMACK
4558                 use_smack = mac_smack_use();
4559 #endif
4560 #if HAVE_APPARMOR
4561                 use_apparmor = mac_apparmor_use();
4562 #endif
4563         }
4564
4565         if (needs_sandboxing) {
4566                 int which_failed;
4567
4568                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4569                  * is set here. (See below.) */
4570
4571                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4572                 if (r < 0) {
4573                         *exit_status = EXIT_LIMITS;
4574                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4575                 }
4576         }
4577
4578         if (needs_setuid && context->pam_name && username) {
4579                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4580                  * wins here. (See above.) */
4581
4582                 /* All fds passed in the fds array will be closed in the pam child process. */
4583                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4584                 if (r < 0) {
4585                         *exit_status = EXIT_PAM;
4586                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4587                 }
4588
4589                 if (ambient_capabilities_supported()) {
4590                         uint64_t ambient_after_pam;
4591
4592                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4593                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4594                         r = capability_get_ambient(&ambient_after_pam);
4595                         if (r < 0) {
4596                                 *exit_status = EXIT_CAPABILITIES;
4597                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4598                         }
4599
4600                         capability_ambient_set |= ambient_after_pam;
4601                 }
4602
4603                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4604                 if (ngids_after_pam < 0) {
4605                         *exit_status = EXIT_MEMORY;
4606                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4607                 }
4608         }
4609
4610         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4611                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4612                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4613                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4614
4615                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4616                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4617                  * the actual requested operations fail (or silently continue). */
4618                 if (r < 0 && context->private_users) {
4619                         *exit_status = EXIT_USER;
4620                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4621                 }
4622                 if (r < 0)
4623                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4624                 else
4625                         userns_set_up = true;
4626         }
4627
4628         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4629
4630                 /* Try to enable network namespacing if network namespacing is available and we have
4631                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4632                  * new network namespace. And if we don't have that, then we could only create a network
4633                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4634                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4635                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4636                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4637                                 log_unit_notice_errno(unit, r,
4638                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4639                         else if (r < 0) {
4640                                 *exit_status = EXIT_NETWORK;
4641                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4642                         }
4643                 } else if (context->network_namespace_path) {
4644                         *exit_status = EXIT_NETWORK;
4645                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4646                                                     "NetworkNamespacePath= is not supported, refusing.");
4647                 } else
4648                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4649         }
4650
4651         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4652
4653                 if (ns_type_supported(NAMESPACE_IPC)) {
4654                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4655                         if (r == -EPERM)
4656                                 log_unit_warning_errno(unit, r,
4657                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4658                         else if (r < 0) {
4659                                 *exit_status = EXIT_NAMESPACE;
4660                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4661                         }
4662                 } else if (context->ipc_namespace_path) {
4663                         *exit_status = EXIT_NAMESPACE;
4664                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4665                                                     "IPCNamespacePath= is not supported, refusing.");
4666                 } else
4667                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4668         }
4669
4670         if (needs_mount_namespace) {
4671                 _cleanup_free_ char *error_path = NULL;
4672
4673                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4674                 if (r < 0) {
4675                         *exit_status = EXIT_NAMESPACE;
4676                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4677                                                     error_path ? ": " : "", strempty(error_path));
4678                 }
4679         }
4680
4681         if (needs_sandboxing) {
4682                 r = apply_protect_hostname(unit, context, exit_status);
4683                 if (r < 0)
4684                         return r;
4685         }
4686
4687         if (context->memory_ksm >= 0)
4688                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4689                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4690                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4691                         else {
4692                                 *exit_status = EXIT_KSM;
4693                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4694                         }
4695                 }
4696
4697         /* Drop groups as early as possible.
4698          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4699          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4700         if (needs_setuid) {
4701                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4702                 int ngids_to_enforce = 0;
4703
4704                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4705                                                    ngids,
4706                                                    gids_after_pam,
4707                                                    ngids_after_pam,
4708                                                    &gids_to_enforce);
4709                 if (ngids_to_enforce < 0) {
4710                         *exit_status = EXIT_MEMORY;
4711                         return log_unit_error_errno(unit,
4712                                                     ngids_to_enforce,
4713                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4714                 }
4715
4716                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4717                 if (r < 0) {
4718                         *exit_status = EXIT_GROUP;
4719                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4720                 }
4721         }
4722
4723         /* If the user namespace was not set up above, try to do it now.
4724          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4725          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4726          * case of mount namespaces being less privileged when the mount point list is copied from a
4727          * different user namespace). */
4728
4729         if (needs_sandboxing && context->private_users && !userns_set_up) {
4730                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4731                 if (r < 0) {
4732                         *exit_status = EXIT_USER;
4733                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4734                 }
4735         }
4736
4737         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4738          * shall execute. */
4739
4740         _cleanup_free_ char *executable = NULL;
4741         _cleanup_close_ int executable_fd = -EBADF;
4742         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4743         if (r < 0) {
4744                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4745                         log_unit_struct_errno(unit, LOG_INFO, r,
4746                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4747                                               LOG_UNIT_INVOCATION_ID(unit),
4748                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4749                                                                command->path),
4750                                               "EXECUTABLE=%s", command->path);
4751                         *exit_status = EXIT_SUCCESS;
4752                         return 0;
4753                 }
4754
4755                 *exit_status = EXIT_EXEC;
4756                 return log_unit_struct_errno(unit, LOG_INFO, r,
4757                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4758                                              LOG_UNIT_INVOCATION_ID(unit),
4759                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4760                                                               command->path),
4761                                              "EXECUTABLE=%s", command->path);
4762         }
4763
4764         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4765         if (r < 0) {
4766                 *exit_status = EXIT_FDS;
4767                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4768         }
4769
4770 #if HAVE_SELINUX
4771         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4772                 int fd = -EBADF;
4773
4774                 if (socket_fd >= 0)
4775                         fd = socket_fd;
4776                 else if (params->n_socket_fds == 1)
4777                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4778                          * use context from that fd to compute the label. */
4779                         fd = params->fds[0];
4780
4781                 if (fd >= 0) {
4782                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4783                         if (r < 0) {
4784                                 if (!context->selinux_context_ignore) {
4785                                         *exit_status = EXIT_SELINUX_CONTEXT;
4786                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4787                                 }
4788                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4789                         }
4790                 }
4791         }
4792 #endif
4793
4794         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4795          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4796          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4797          * execve(). */
4798
4799         r = close_all_fds(keep_fds, n_keep_fds);
4800         if (r >= 0)
4801                 r = shift_fds(fds, n_fds);
4802         if (r >= 0)
4803                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4804         if (r < 0) {
4805                 *exit_status = EXIT_FDS;
4806                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4807         }
4808
4809         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4810          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4811          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4812          * came this far. */
4813
4814         secure_bits = context->secure_bits;
4815
4816         if (needs_sandboxing) {
4817                 uint64_t bset;
4818
4819                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4820                  * (Note this is placed after the general resource limit initialization, see above, in order
4821                  * to take precedence.) */
4822                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4823                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4824                                 *exit_status = EXIT_LIMITS;
4825                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4826                         }
4827                 }
4828
4829 #if ENABLE_SMACK
4830                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4831                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4832                 if (use_smack) {
4833                         r = setup_smack(unit->manager, context, executable_fd);
4834                         if (r < 0 && !context->smack_process_label_ignore) {
4835                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4836                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4837                         }
4838                 }
4839 #endif
4840
4841                 bset = context->capability_bounding_set;
4842                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4843                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4844                  * instead of us doing that */
4845                 if (needs_ambient_hack)
4846                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4847                                 (UINT64_C(1) << CAP_SETUID) |
4848                                 (UINT64_C(1) << CAP_SETGID);
4849
4850                 if (!cap_test_all(bset)) {
4851                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4852                         if (r < 0) {
4853                                 *exit_status = EXIT_CAPABILITIES;
4854                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4855                         }
4856                 }
4857
4858                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4859                  * keep-caps set.
4860                  *
4861                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4862                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4863                  * the ambient capabilities can be raised as they are present in the permitted and
4864                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4865                  * without changing the user, so we also set the ambient capabilities here.
4866                  *
4867                  * The requested ambient capabilities are raised in the inheritable set if the second
4868                  * argument is true. */
4869                 if (!needs_ambient_hack) {
4870                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4871                         if (r < 0) {
4872                                 *exit_status = EXIT_CAPABILITIES;
4873                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4874                         }
4875                 }
4876         }
4877
4878         /* chroot to root directory first, before we lose the ability to chroot */
4879         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4880         if (r < 0)
4881                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4882
4883         if (needs_setuid) {
4884                 if (uid_is_valid(uid)) {
4885                         r = enforce_user(context, uid, capability_ambient_set);
4886                         if (r < 0) {
4887                                 *exit_status = EXIT_USER;
4888                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4889                         }
4890
4891                         if (!needs_ambient_hack && capability_ambient_set != 0) {
4892
4893                                 /* Raise the ambient capabilities after user change. */
4894                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4895                                 if (r < 0) {
4896                                         *exit_status = EXIT_CAPABILITIES;
4897                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4898                                 }
4899                         }
4900                 }
4901         }
4902
4903         /* Apply working directory here, because the working directory might be on NFS and only the user running
4904          * this service might have the correct privilege to change to the working directory */
4905         r = apply_working_directory(context, params, runtime, home, exit_status);
4906         if (r < 0)
4907                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4908
4909         if (needs_sandboxing) {
4910                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4911                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4912                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4913                  * are restricted. */
4914
4915 #if HAVE_SELINUX
4916                 if (use_selinux) {
4917                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4918
4919                         if (exec_context) {
4920                                 r = setexeccon(exec_context);
4921                                 if (r < 0) {
4922                                         if (!context->selinux_context_ignore) {
4923                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4924                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4925                                         }
4926                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4927                                 }
4928                         }
4929                 }
4930 #endif
4931
4932 #if HAVE_APPARMOR
4933                 if (use_apparmor && context->apparmor_profile) {
4934                         r = aa_change_onexec(context->apparmor_profile);
4935                         if (r < 0 && !context->apparmor_profile_ignore) {
4936                                 *exit_status = EXIT_APPARMOR_PROFILE;
4937                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4938                         }
4939                 }
4940 #endif
4941
4942                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4943                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4944                  * requires CAP_SETPCAP. */
4945                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4946                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4947                          * effective set here.
4948                          *
4949                          * The effective set is overwritten during execve() with the following values:
4950                          *
4951                          * - ambient set (for non-root processes)
4952                          *
4953                          * - (inheritable | bounding) set (for root processes)
4954                          *
4955                          * Hence there is no security impact to raise it in the effective set before execve
4956                          */
4957                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4958                         if (r < 0) {
4959                                 *exit_status = EXIT_CAPABILITIES;
4960                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4961                         }
4962                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4963                                 *exit_status = EXIT_SECUREBITS;
4964                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4965                         }
4966                 }
4967
4968                 if (context_has_no_new_privileges(context))
4969                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4970                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4971                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4972                         }
4973
4974 #if HAVE_SECCOMP
4975                 r = apply_address_families(unit, context);
4976                 if (r < 0) {
4977                         *exit_status = EXIT_ADDRESS_FAMILIES;
4978                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4979                 }
4980
4981                 r = apply_memory_deny_write_execute(unit, context);
4982                 if (r < 0) {
4983                         *exit_status = EXIT_SECCOMP;
4984                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4985                 }
4986
4987                 r = apply_restrict_realtime(unit, context);
4988                 if (r < 0) {
4989                         *exit_status = EXIT_SECCOMP;
4990                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4991                 }
4992
4993                 r = apply_restrict_suid_sgid(unit, context);
4994                 if (r < 0) {
4995                         *exit_status = EXIT_SECCOMP;
4996                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4997                 }
4998
4999                 r = apply_restrict_namespaces(unit, context);
5000                 if (r < 0) {
5001                         *exit_status = EXIT_SECCOMP;
5002                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5003                 }
5004
5005                 r = apply_protect_sysctl(unit, context);
5006                 if (r < 0) {
5007                         *exit_status = EXIT_SECCOMP;
5008                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5009                 }
5010
5011                 r = apply_protect_kernel_modules(unit, context);
5012                 if (r < 0) {
5013                         *exit_status = EXIT_SECCOMP;
5014                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5015                 }
5016
5017                 r = apply_protect_kernel_logs(unit, context);
5018                 if (r < 0) {
5019                         *exit_status = EXIT_SECCOMP;
5020                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5021                 }
5022
5023                 r = apply_protect_clock(unit, context);
5024                 if (r < 0) {
5025                         *exit_status = EXIT_SECCOMP;
5026                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5027                 }
5028
5029                 r = apply_private_devices(unit, context);
5030                 if (r < 0) {
5031                         *exit_status = EXIT_SECCOMP;
5032                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5033                 }
5034
5035                 r = apply_syscall_archs(unit, context);
5036                 if (r < 0) {
5037                         *exit_status = EXIT_SECCOMP;
5038                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5039                 }
5040
5041                 r = apply_lock_personality(unit, context);
5042                 if (r < 0) {
5043                         *exit_status = EXIT_SECCOMP;
5044                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5045                 }
5046
5047                 r = apply_syscall_log(unit, context);
5048                 if (r < 0) {
5049                         *exit_status = EXIT_SECCOMP;
5050                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5051                 }
5052
5053                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5054                  * by the filter as little as possible. */
5055                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5056                 if (r < 0) {
5057                         *exit_status = EXIT_SECCOMP;
5058                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5059                 }
5060 #endif
5061
5062 #if HAVE_LIBBPF
5063                 r = apply_restrict_filesystems(unit, context);
5064                 if (r < 0) {
5065                         *exit_status = EXIT_BPF;
5066                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5067                 }
5068 #endif
5069
5070         }
5071
5072         if (!strv_isempty(context->unset_environment)) {
5073                 char **ee = NULL;
5074
5075                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5076                 if (!ee) {
5077                         *exit_status = EXIT_MEMORY;
5078                         return log_oom();
5079                 }
5080
5081                 strv_free_and_replace(accum_env, ee);
5082         }
5083
5084         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5085                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5086
5087                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5088                 if (r < 0) {
5089                         *exit_status = EXIT_MEMORY;
5090                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5091                 }
5092                 final_argv = replaced_argv;
5093
5094                 if (!strv_isempty(unset_variables)) {
5095                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5096                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5097                 }
5098
5099                 if (!strv_isempty(bad_variables)) {
5100                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5101                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5102                 }
5103         } else
5104                 final_argv = command->argv;
5105
5106         log_command_line(unit, "Executing", executable, final_argv);
5107
5108         if (exec_fd >= 0) {
5109                 uint8_t hot = 1;
5110
5111                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5112                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5113
5114                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5115                         *exit_status = EXIT_EXEC;
5116                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5117                 }
5118         }
5119
5120         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5121
5122         if (exec_fd >= 0) {
5123                 uint8_t hot = 0;
5124
5125                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5126                  * that POLLHUP on it no longer means execve() succeeded. */
5127
5128                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5129                         *exit_status = EXIT_EXEC;
5130                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5131                 }
5132         }
5133
5134         *exit_status = EXIT_EXEC;
5135         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5136 }
5137
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below
 * already calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5140
5141 int exec_spawn(Unit *unit,
5142                ExecCommand *command,
5143                const ExecContext *context,
5144                const ExecParameters *params,
5145                ExecRuntime *runtime,
5146                const CGroupContext *cgroup_context,
5147                pid_t *ret) {
5148
5149         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5150         _cleanup_free_ char *subcgroup_path = NULL;
5151         _cleanup_strv_free_ char **files_env = NULL;
5152         size_t n_storage_fds = 0, n_socket_fds = 0;
5153         pid_t pid;
5154
5155         assert(unit);
5156         assert(command);
5157         assert(context);
5158         assert(ret);
5159         assert(params);
5160         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5161
5162         LOG_CONTEXT_PUSH_UNIT(unit);
5163
5164         if (context->std_input == EXEC_INPUT_SOCKET ||
5165             context->std_output == EXEC_OUTPUT_SOCKET ||
5166             context->std_error == EXEC_OUTPUT_SOCKET) {
5167
5168                 if (params->n_socket_fds > 1)
5169                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5170
5171                 if (params->n_socket_fds == 0)
5172                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5173
5174                 socket_fd = params->fds[0];
5175         } else {
5176                 socket_fd = -EBADF;
5177                 fds = params->fds;
5178                 n_socket_fds = params->n_socket_fds;
5179                 n_storage_fds = params->n_storage_fds;
5180         }
5181
5182         r = exec_context_named_iofds(context, params, named_iofds);
5183         if (r < 0)
5184                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5185
5186         r = exec_context_load_environment(unit, context, &files_env);
5187         if (r < 0)
5188                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5189
5190         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5191            and, until the next SELinux policy changes, we save further reloads in future children. */
5192         mac_selinux_maybe_reload();
5193
5194         /* We won't know the real executable path until we create the mount namespace in the child, but we
5195            want to log from the parent, so we use the possibly inaccurate path here. */
5196         log_command_line(unit, "About to execute", command->path, command->argv);
5197
5198         if (params->cgroup_path) {
5199                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5200                 if (r < 0)
5201                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5202                 if (r > 0) {
5203                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5204                          * realized by the unit logic) */
5205
5206                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5207                         if (r < 0)
5208                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5209                 }
5210         }
5211
5212         pid = fork();
5213         if (pid < 0)
5214                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5215
5216         if (pid == 0) {
5217                 int exit_status;
5218
5219                 r = exec_child(unit,
5220                                command,
5221                                context,
5222                                params,
5223                                runtime,
5224                                cgroup_context,
5225                                socket_fd,
5226                                named_iofds,
5227                                fds,
5228                                n_socket_fds,
5229                                n_storage_fds,
5230                                files_env,
5231                                unit->manager->user_lookup_fds[1],
5232                                &exit_status);
5233
5234                 if (r < 0) {
5235                         const char *status = ASSERT_PTR(
5236                                         exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
5237
5238                         log_unit_struct_errno(unit, LOG_ERR, r,
5239                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5240                                               LOG_UNIT_INVOCATION_ID(unit),
5241                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5242                                                                status, command->path),
5243                                               "EXECUTABLE=%s", command->path);
5244                 } else
5245                         assert(exit_status == EXIT_SUCCESS);
5246
5247                 _exit(exit_status);
5248         }
5249
5250         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5251
5252         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5253          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5254          * process will be killed too). */
5255         if (subcgroup_path)
5256                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5257
5258         exec_status_start(&command->exec_status, pid);
5259
5260         *ret = pid;
5261         return 0;
5262 }
5263
5264 void exec_context_init(ExecContext *c) {
5265         assert(c);
5266
5267         *c = (ExecContext) {
5268                 .umask = 0022,
5269                 .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
5270                 .cpu_sched_policy = SCHED_OTHER,
5271                 .syslog_priority = LOG_DAEMON|LOG_INFO,
5272                 .syslog_level_prefix = true,
5273                 .ignore_sigpipe = true,
5274                 .timer_slack_nsec = NSEC_INFINITY,
5275                 .personality = PERSONALITY_INVALID,
5276                 .timeout_clean_usec = USEC_INFINITY,
5277                 .capability_bounding_set = CAP_MASK_UNSET,
5278                 .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
5279                 .log_level_max = -1,
5280 #if HAVE_SECCOMP
5281                 .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
5282 #endif
5283                 .tty_rows = UINT_MAX,
5284                 .tty_cols = UINT_MAX,
5285                 .private_mounts = -1,
5286                 .memory_ksm = -1,
5287                 .set_login_environment = -1,
5288         };
5289
5290         FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
5291                 d->mode = 0755;
5292
5293         numa_policy_reset(&c->numa_policy);
5294
5295         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5296 }
5297
5298 void exec_context_done(ExecContext *c) {
5299         assert(c);
5300
5301         c->environment = strv_free(c->environment);
5302         c->environment_files = strv_free(c->environment_files);
5303         c->pass_environment = strv_free(c->pass_environment);
5304         c->unset_environment = strv_free(c->unset_environment);
5305
5306         rlimit_free_all(c->rlimit);
5307
5308         for (size_t l = 0; l < 3; l++) {
5309                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5310                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5311         }
5312
5313         c->working_directory = mfree(c->working_directory);
5314         c->root_directory = mfree(c->root_directory);
5315         c->root_image = mfree(c->root_image);
5316         c->root_image_options = mount_options_free_all(c->root_image_options);
5317         c->root_hash = mfree(c->root_hash);
5318         c->root_hash_size = 0;
5319         c->root_hash_path = mfree(c->root_hash_path);
5320         c->root_hash_sig = mfree(c->root_hash_sig);
5321         c->root_hash_sig_size = 0;
5322         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5323         c->root_verity = mfree(c->root_verity);
5324         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5325         c->extension_directories = strv_free(c->extension_directories);
5326         c->tty_path = mfree(c->tty_path);
5327         c->syslog_identifier = mfree(c->syslog_identifier);
5328         c->user = mfree(c->user);
5329         c->group = mfree(c->group);
5330
5331         c->supplementary_groups = strv_free(c->supplementary_groups);
5332
5333         c->pam_name = mfree(c->pam_name);
5334
5335         c->read_only_paths = strv_free(c->read_only_paths);
5336         c->read_write_paths = strv_free(c->read_write_paths);
5337         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5338         c->exec_paths = strv_free(c->exec_paths);
5339         c->no_exec_paths = strv_free(c->no_exec_paths);
5340         c->exec_search_path = strv_free(c->exec_search_path);
5341
5342         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5343         c->bind_mounts = NULL;
5344         c->n_bind_mounts = 0;
5345         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5346         c->temporary_filesystems = NULL;
5347         c->n_temporary_filesystems = 0;
5348         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5349
5350         cpu_set_reset(&c->cpu_set);
5351         numa_policy_reset(&c->numa_policy);
5352
5353         c->utmp_id = mfree(c->utmp_id);
5354         c->selinux_context = mfree(c->selinux_context);
5355         c->apparmor_profile = mfree(c->apparmor_profile);
5356         c->smack_process_label = mfree(c->smack_process_label);
5357
5358         c->restrict_filesystems = set_free_free(c->restrict_filesystems);
5359
5360         c->syscall_filter = hashmap_free(c->syscall_filter);
5361         c->syscall_archs = set_free(c->syscall_archs);
5362         c->address_families = set_free(c->address_families);
5363
5364         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5365                 exec_directory_done(&c->directories[t]);
5366
5367         c->log_level_max = -1;
5368
5369         exec_context_free_log_extra_fields(c);
5370         c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5371         c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
5372
5373         c->log_ratelimit_interval_usec = 0;
5374         c->log_ratelimit_burst = 0;
5375
5376         c->stdin_data = mfree(c->stdin_data);
5377         c->stdin_data_size = 0;
5378
5379         c->network_namespace_path = mfree(c->network_namespace_path);
5380         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5381
5382         c->log_namespace = mfree(c->log_namespace);
5383
5384         c->load_credentials = hashmap_free(c->load_credentials);
5385         c->set_credentials = hashmap_free(c->set_credentials);
5386         c->import_credentials = set_free_free(c->import_credentials);
5387
5388         c->root_image_policy = image_policy_free(c->root_image_policy);
5389         c->mount_image_policy = image_policy_free(c->mount_image_policy);
5390         c->extension_image_policy = image_policy_free(c->extension_image_policy);
5391 }
5392
5393 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5394         assert(c);
5395
5396         if (!runtime_prefix)
5397                 return 0;
5398
5399         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5400                 _cleanup_free_ char *p = NULL;
5401
5402                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5403                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5404                 else
5405                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5406                 if (!p)
5407                         return -ENOMEM;
5408
5409                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5410                  * service next. */
5411                 (void) rm_rf(p, REMOVE_ROOT);
5412
5413                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5414                         _cleanup_free_ char *symlink_abs = NULL;
5415
5416                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5417                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5418                         else
5419                                 symlink_abs = path_join(runtime_prefix, *symlink);
5420                         if (!symlink_abs)
5421                                 return -ENOMEM;
5422
5423                         (void) unlink(symlink_abs);
5424                 }
5425         }
5426
5427         return 0;
5428 }
5429
5430 int exec_context_destroy_mount_ns_dir(Unit *u) {
5431         _cleanup_free_ char *p = NULL;
5432
5433         if (!u || !MANAGER_IS_SYSTEM(u->manager))
5434                 return 0;
5435
5436         p = path_join("/run/systemd/propagate/", u->id);
5437         if (!p)
5438                 return -ENOMEM;
5439
5440         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5441         if (rmdir(p) < 0 && errno != ENOENT)
5442                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5443
5444         return 0;
5445 }
5446
5447 static void exec_command_done(ExecCommand *c) {
5448         assert(c);
5449
5450         c->path = mfree(c->path);
5451         c->argv = strv_free(c->argv);
5452 }
5453
5454 void exec_command_done_array(ExecCommand *c, size_t n) {
5455         for (size_t i = 0; i < n; i++)
5456                 exec_command_done(c+i);
5457 }
5458
5459 ExecCommand* exec_command_free_list(ExecCommand *c) {
5460         ExecCommand *i;
5461
5462         while ((i = LIST_POP(command, c))) {
5463                 exec_command_done(i);
5464                 free(i);
5465         }
5466
5467         return NULL;
5468 }
5469
5470 void exec_command_free_array(ExecCommand **c, size_t n) {
5471         for (size_t i = 0; i < n; i++)
5472                 c[i] = exec_command_free_list(c[i]);
5473 }
5474
5475 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5476         for (size_t i = 0; i < n; i++)
5477                 exec_status_reset(&c[i].exec_status);
5478 }
5479
5480 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5481         for (size_t i = 0; i < n; i++)
5482                 LIST_FOREACH(command, z, c[i])
5483                         exec_status_reset(&z->exec_status);
5484 }
5485
/* Context handed to invalid_env() through its userdata pointer while the contents of an environment file
 * are being cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file being processed; appended to the log message */
} InvalidEnvInfo;
5490
5491 static void invalid_env(const char *p, void *userdata) {
5492         InvalidEnvInfo *info = userdata;
5493
5494         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5495 }
5496
5497 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5498         assert(c);
5499
5500         switch (fd_index) {
5501
5502         case STDIN_FILENO:
5503                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5504                         return NULL;
5505
5506                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5507
5508         case STDOUT_FILENO:
5509                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5510                         return NULL;
5511
5512                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5513
5514         case STDERR_FILENO:
5515                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5516                         return NULL;
5517
5518                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5519
5520         default:
5521                 return NULL;
5522         }
5523 }
5524
5525 static int exec_context_named_iofds(
5526                 const ExecContext *c,
5527                 const ExecParameters *p,
5528                 int named_iofds[static 3]) {
5529
5530         size_t targets;
5531         const char* stdio_fdname[3];
5532         size_t n_fds;
5533
5534         assert(c);
5535         assert(p);
5536         assert(named_iofds);
5537
5538         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5539                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5540                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5541
5542         for (size_t i = 0; i < 3; i++)
5543                 stdio_fdname[i] = exec_context_fdname(c, i);
5544
5545         n_fds = p->n_storage_fds + p->n_socket_fds;
5546
5547         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5548                 if (named_iofds[STDIN_FILENO] < 0 &&
5549                     c->std_input == EXEC_INPUT_NAMED_FD &&
5550                     stdio_fdname[STDIN_FILENO] &&
5551                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5552
5553                         named_iofds[STDIN_FILENO] = p->fds[i];
5554                         targets--;
5555
5556                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5557                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5558                            stdio_fdname[STDOUT_FILENO] &&
5559                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5560
5561                         named_iofds[STDOUT_FILENO] = p->fds[i];
5562                         targets--;
5563
5564                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5565                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5566                            stdio_fdname[STDERR_FILENO] &&
5567                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5568
5569                         named_iofds[STDERR_FILENO] = p->fds[i];
5570                         targets--;
5571                 }
5572
5573         return targets == 0 ? 0 : -ENOENT;
5574 }
5575
5576 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5577         _cleanup_strv_free_ char **v = NULL;
5578         int r;
5579
5580         assert(c);
5581         assert(ret);
5582
5583         STRV_FOREACH(i, c->environment_files) {
5584                 _cleanup_globfree_ glob_t pglob = {};
5585                 bool ignore = false;
5586                 char *fn = *i;
5587
5588                 if (fn[0] == '-') {
5589                         ignore = true;
5590                         fn++;
5591                 }
5592
5593                 if (!path_is_absolute(fn)) {
5594                         if (ignore)
5595                                 continue;
5596                         return -EINVAL;
5597                 }
5598
5599                 /* Filename supports globbing, take all matching files */
5600                 r = safe_glob(fn, 0, &pglob);
5601                 if (r < 0) {
5602                         if (ignore)
5603                                 continue;
5604                         return r;
5605                 }
5606
5607                 /* When we don't match anything, -ENOENT should be returned */
5608                 assert(pglob.gl_pathc > 0);
5609
5610                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5611                         _cleanup_strv_free_ char **p = NULL;
5612
5613                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5614                         if (r < 0) {
5615                                 if (ignore)
5616                                         continue;
5617                                 return r;
5618                         }
5619
5620                         /* Log invalid environment variables with filename */
5621                         if (p) {
5622                                 InvalidEnvInfo info = {
5623                                         .unit = unit,
5624                                         .path = pglob.gl_pathv[n]
5625                                 };
5626
5627                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5628                         }
5629
5630                         if (!v)
5631                                 v = TAKE_PTR(p);
5632                         else {
5633                                 char **m = strv_env_merge(v, p);
5634                                 if (!m)
5635                                         return -ENOMEM;
5636
5637                                 strv_free_and_replace(v, m);
5638                         }
5639                 }
5640         }
5641
5642         *ret = TAKE_PTR(v);
5643
5644         return 0;
5645 }
5646
5647 static bool tty_may_match_dev_console(const char *tty) {
5648         _cleanup_free_ char *resolved = NULL;
5649
5650         if (!tty)
5651                 return true;
5652
5653         tty = skip_dev_prefix(tty);
5654
5655         /* trivial identity? */
5656         if (streq(tty, "console"))
5657                 return true;
5658
5659         if (resolve_dev_console(&resolved) < 0)
5660                 return true; /* if we could not resolve, assume it may */
5661
5662         /* "tty0" means the active VC, so it may be the same sometimes */
5663         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5664 }
5665
5666 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5667         assert(ec);
5668
5669         return ec->tty_reset ||
5670                 ec->tty_vhangup ||
5671                 ec->tty_vt_disallocate ||
5672                 is_terminal_input(ec->std_input) ||
5673                 is_terminal_output(ec->std_output) ||
5674                 is_terminal_output(ec->std_error);
5675 }
5676
5677 bool exec_context_may_touch_console(const ExecContext *ec) {
5678
5679         return exec_context_may_touch_tty(ec) &&
5680                tty_may_match_dev_console(exec_context_tty_path(ec));
5681 }
5682
/* Writes every string of the NULL-terminated string vector 'l' to 'f', each one preceded by a single
 * space. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        if (!l)
                return;

        for (char **item = l; *item; item++)
                fprintf(f, " %s", *item);
}
5689
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing at all is written if
 * 'strv' is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5701
5702 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5703         int r;
5704
5705         assert(c);
5706         assert(f);
5707
5708         prefix = strempty(prefix);
5709
5710         fprintf(f,
5711                 "%sUMask: %04o\n"
5712                 "%sWorkingDirectory: %s\n"
5713                 "%sRootDirectory: %s\n"
5714                 "%sRootEphemeral: %s\n"
5715                 "%sNonBlocking: %s\n"
5716                 "%sPrivateTmp: %s\n"
5717                 "%sPrivateDevices: %s\n"
5718                 "%sProtectKernelTunables: %s\n"
5719                 "%sProtectKernelModules: %s\n"
5720                 "%sProtectKernelLogs: %s\n"
5721                 "%sProtectClock: %s\n"
5722                 "%sProtectControlGroups: %s\n"
5723                 "%sPrivateNetwork: %s\n"
5724                 "%sPrivateUsers: %s\n"
5725                 "%sProtectHome: %s\n"
5726                 "%sProtectSystem: %s\n"
5727                 "%sMountAPIVFS: %s\n"
5728                 "%sIgnoreSIGPIPE: %s\n"
5729                 "%sMemoryDenyWriteExecute: %s\n"
5730                 "%sRestrictRealtime: %s\n"
5731                 "%sRestrictSUIDSGID: %s\n"
5732                 "%sKeyringMode: %s\n"
5733                 "%sProtectHostname: %s\n"
5734                 "%sProtectProc: %s\n"
5735                 "%sProcSubset: %s\n",
5736                 prefix, c->umask,
5737                 prefix, empty_to_root(c->working_directory),
5738                 prefix, empty_to_root(c->root_directory),
5739                 prefix, yes_no(c->root_ephemeral),
5740                 prefix, yes_no(c->non_blocking),
5741                 prefix, yes_no(c->private_tmp),
5742                 prefix, yes_no(c->private_devices),
5743                 prefix, yes_no(c->protect_kernel_tunables),
5744                 prefix, yes_no(c->protect_kernel_modules),
5745                 prefix, yes_no(c->protect_kernel_logs),
5746                 prefix, yes_no(c->protect_clock),
5747                 prefix, yes_no(c->protect_control_groups),
5748                 prefix, yes_no(c->private_network),
5749                 prefix, yes_no(c->private_users),
5750                 prefix, protect_home_to_string(c->protect_home),
5751                 prefix, protect_system_to_string(c->protect_system),
5752                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5753                 prefix, yes_no(c->ignore_sigpipe),
5754                 prefix, yes_no(c->memory_deny_write_execute),
5755                 prefix, yes_no(c->restrict_realtime),
5756                 prefix, yes_no(c->restrict_suid_sgid),
5757                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5758                 prefix, yes_no(c->protect_hostname),
5759                 prefix, protect_proc_to_string(c->protect_proc),
5760                 prefix, proc_subset_to_string(c->proc_subset));
5761
5762         if (c->root_image)
5763                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5764
5765         if (c->root_image_options) {
5766                 fprintf(f, "%sRootImageOptions:", prefix);
5767                 LIST_FOREACH(mount_options, o, c->root_image_options)
5768                         if (!isempty(o->options))
5769                                 fprintf(f, " %s:%s",
5770                                         partition_designator_to_string(o->partition_designator),
5771                                         o->options);
5772                 fprintf(f, "\n");
5773         }
5774
5775         if (c->root_hash) {
5776                 _cleanup_free_ char *encoded = NULL;
5777                 encoded = hexmem(c->root_hash, c->root_hash_size);
5778                 if (encoded)
5779                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5780         }
5781
5782         if (c->root_hash_path)
5783                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5784
5785         if (c->root_hash_sig) {
5786                 _cleanup_free_ char *encoded = NULL;
5787                 ssize_t len;
5788                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5789                 if (len)
5790                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5791         }
5792
5793         if (c->root_hash_sig_path)
5794                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5795
5796         if (c->root_verity)
5797                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5798
5799         STRV_FOREACH(e, c->environment)
5800                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5801
5802         STRV_FOREACH(e, c->environment_files)
5803                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5804
5805         STRV_FOREACH(e, c->pass_environment)
5806                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5807
5808         STRV_FOREACH(e, c->unset_environment)
5809                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5810
5811         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5812
5813         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5814                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5815
5816                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5817                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5818
5819                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5820                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5821                 }
5822         }
5823
5824         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5825
5826         if (c->nice_set)
5827                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5828
5829         if (c->oom_score_adjust_set)
5830                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5831
5832         if (c->coredump_filter_set)
5833                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5834
5835         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5836                 if (c->rlimit[i]) {
5837                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5838                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5839                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5840                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5841                 }
5842
5843         if (c->ioprio_set) {
5844                 _cleanup_free_ char *class_str = NULL;
5845
5846                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5847                 if (r >= 0)
5848                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5849
5850                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5851         }
5852
5853         if (c->cpu_sched_set) {
5854                 _cleanup_free_ char *policy_str = NULL;
5855
5856                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5857                 if (r >= 0)
5858                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5859
5860                 fprintf(f,
5861                         "%sCPUSchedulingPriority: %i\n"
5862                         "%sCPUSchedulingResetOnFork: %s\n",
5863                         prefix, c->cpu_sched_priority,
5864                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5865         }
5866
5867         if (c->cpu_set.set) {
5868                 _cleanup_free_ char *affinity = NULL;
5869
5870                 affinity = cpu_set_to_range_string(&c->cpu_set);
5871                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5872         }
5873
5874         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5875                 _cleanup_free_ char *nodes = NULL;
5876
5877                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5878                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5879                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5880         }
5881
5882         if (c->timer_slack_nsec != NSEC_INFINITY)
5883                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5884
5885         fprintf(f,
5886                 "%sStandardInput: %s\n"
5887                 "%sStandardOutput: %s\n"
5888                 "%sStandardError: %s\n",
5889                 prefix, exec_input_to_string(c->std_input),
5890                 prefix, exec_output_to_string(c->std_output),
5891                 prefix, exec_output_to_string(c->std_error));
5892
5893         if (c->std_input == EXEC_INPUT_NAMED_FD)
5894                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5895         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5896                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5897         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5898                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5899
5900         if (c->std_input == EXEC_INPUT_FILE)
5901                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5902         if (c->std_output == EXEC_OUTPUT_FILE)
5903                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5904         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5905                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5906         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5907                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5908         if (c->std_error == EXEC_OUTPUT_FILE)
5909                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5910         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5911                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5912         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5913                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5914
5915         if (c->tty_path)
5916                 fprintf(f,
5917                         "%sTTYPath: %s\n"
5918                         "%sTTYReset: %s\n"
5919                         "%sTTYVHangup: %s\n"
5920                         "%sTTYVTDisallocate: %s\n"
5921                         "%sTTYRows: %u\n"
5922                         "%sTTYColumns: %u\n",
5923                         prefix, c->tty_path,
5924                         prefix, yes_no(c->tty_reset),
5925                         prefix, yes_no(c->tty_vhangup),
5926                         prefix, yes_no(c->tty_vt_disallocate),
5927                         prefix, c->tty_rows,
5928                         prefix, c->tty_cols);
5929
5930         if (IN_SET(c->std_output,
5931                    EXEC_OUTPUT_KMSG,
5932                    EXEC_OUTPUT_JOURNAL,
5933                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5934                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5935             IN_SET(c->std_error,
5936                    EXEC_OUTPUT_KMSG,
5937                    EXEC_OUTPUT_JOURNAL,
5938                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5939                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5940
5941                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5942
5943                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5944                 if (r >= 0)
5945                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5946
5947                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5948                 if (r >= 0)
5949                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5950         }
5951
5952         if (c->log_level_max >= 0) {
5953                 _cleanup_free_ char *t = NULL;
5954
5955                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5956
5957                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5958         }
5959
5960         if (c->log_ratelimit_interval_usec > 0)
5961                 fprintf(f,
5962                         "%sLogRateLimitIntervalSec: %s\n",
5963                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5964
5965         if (c->log_ratelimit_burst > 0)
5966                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5967
5968         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5969                 fprintf(f, "%sLogFilterPatterns:", prefix);
5970
5971                 char *pattern;
5972                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5973                         fprintf(f, " %s", pattern);
5974                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5975                         fprintf(f, " ~%s", pattern);
5976                 fputc('\n', f);
5977         }
5978
5979         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5980                 fprintf(f, "%sLogExtraFields: ", prefix);
5981                 fwrite(c->log_extra_fields[j].iov_base,
5982                        1, c->log_extra_fields[j].iov_len,
5983                        f);
5984                 fputc('\n', f);
5985         }
5986
5987         if (c->log_namespace)
5988                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5989
5990         if (c->secure_bits) {
5991                 _cleanup_free_ char *str = NULL;
5992
5993                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5994                 if (r >= 0)
5995                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5996         }
5997
5998         if (c->capability_bounding_set != CAP_MASK_UNSET) {
5999                 _cleanup_free_ char *str = NULL;
6000
6001                 r = capability_set_to_string(c->capability_bounding_set, &str);
6002                 if (r >= 0)
6003                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6004         }
6005
6006         if (c->capability_ambient_set != 0) {
6007                 _cleanup_free_ char *str = NULL;
6008
6009                 r = capability_set_to_string(c->capability_ambient_set, &str);
6010                 if (r >= 0)
6011                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6012         }
6013
6014         if (c->user)
6015                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6016         if (c->group)
6017                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6018
6019         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6020
6021         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6022
6023         if (c->pam_name)
6024                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6025
6026         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6027         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6028         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6029         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6030         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6031         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6032
6033         for (size_t i = 0; i < c->n_bind_mounts; i++)
6034                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6035                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6036                         c->bind_mounts[i].ignore_enoent ? "-": "",
6037                         c->bind_mounts[i].source,
6038                         c->bind_mounts[i].destination,
6039                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6040
6041         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6042                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6043
6044                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6045                         t->path,
6046                         isempty(t->options) ? "" : ":",
6047                         strempty(t->options));
6048         }
6049
6050         if (c->utmp_id)
6051                 fprintf(f,
6052                         "%sUtmpIdentifier: %s\n",
6053                         prefix, c->utmp_id);
6054
6055         if (c->selinux_context)
6056                 fprintf(f,
6057                         "%sSELinuxContext: %s%s\n",
6058                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6059
6060         if (c->apparmor_profile)
6061                 fprintf(f,
6062                         "%sAppArmorProfile: %s%s\n",
6063                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6064
6065         if (c->smack_process_label)
6066                 fprintf(f,
6067                         "%sSmackProcessLabel: %s%s\n",
6068                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6069
6070         if (c->personality != PERSONALITY_INVALID)
6071                 fprintf(f,
6072                         "%sPersonality: %s\n",
6073                         prefix, strna(personality_to_string(c->personality)));
6074
6075         fprintf(f,
6076                 "%sLockPersonality: %s\n",
6077                 prefix, yes_no(c->lock_personality));
6078
6079         if (c->syscall_filter) {
6080                 fprintf(f,
6081                         "%sSystemCallFilter: ",
6082                         prefix);
6083
6084                 if (!c->syscall_allow_list)
6085                         fputc('~', f);
6086
6087 #if HAVE_SECCOMP
6088                 void *id, *val;
6089                 bool first = true;
6090                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6091                         _cleanup_free_ char *name = NULL;
6092                         const char *errno_name = NULL;
6093                         int num = PTR_TO_INT(val);
6094
6095                         if (first)
6096                                 first = false;
6097                         else
6098                                 fputc(' ', f);
6099
6100                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6101                         fputs(strna(name), f);
6102
6103                         if (num >= 0) {
6104                                 errno_name = seccomp_errno_or_action_to_string(num);
6105                                 if (errno_name)
6106                                         fprintf(f, ":%s", errno_name);
6107                                 else
6108                                         fprintf(f, ":%d", num);
6109                         }
6110                 }
6111 #endif
6112
6113                 fputc('\n', f);
6114         }
6115
6116         if (c->syscall_archs) {
6117                 fprintf(f,
6118                         "%sSystemCallArchitectures:",
6119                         prefix);
6120
6121 #if HAVE_SECCOMP
6122                 void *id;
6123                 SET_FOREACH(id, c->syscall_archs)
6124                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6125 #endif
6126                 fputc('\n', f);
6127         }
6128
6129         if (exec_context_restrict_namespaces_set(c)) {
6130                 _cleanup_free_ char *s = NULL;
6131
6132                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6133                 if (r >= 0)
6134                         fprintf(f, "%sRestrictNamespaces: %s\n",
6135                                 prefix, strna(s));
6136         }
6137
6138 #if HAVE_LIBBPF
6139         if (exec_context_restrict_filesystems_set(c)) {
6140                 char *fs;
6141                 SET_FOREACH(fs, c->restrict_filesystems)
6142                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6143         }
6144 #endif
6145
6146         if (c->network_namespace_path)
6147                 fprintf(f,
6148                         "%sNetworkNamespacePath: %s\n",
6149                         prefix, c->network_namespace_path);
6150
6151         if (c->syscall_errno > 0) {
6152                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6153
6154 #if HAVE_SECCOMP
6155                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6156                 if (errno_name)
6157                         fputs(errno_name, f);
6158                 else
6159                         fprintf(f, "%d", c->syscall_errno);
6160 #endif
6161                 fputc('\n', f);
6162         }
6163
6164         for (size_t i = 0; i < c->n_mount_images; i++) {
6165                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6166                         c->mount_images[i].ignore_enoent ? "-": "",
6167                         c->mount_images[i].source,
6168                         c->mount_images[i].destination);
6169                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6170                         fprintf(f, ":%s:%s",
6171                                 partition_designator_to_string(o->partition_designator),
6172                                 strempty(o->options));
6173                 fprintf(f, "\n");
6174         }
6175
6176         for (size_t i = 0; i < c->n_extension_images; i++) {
6177                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6178                         c->extension_images[i].ignore_enoent ? "-": "",
6179                         c->extension_images[i].source);
6180                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6181                         fprintf(f, ":%s:%s",
6182                                 partition_designator_to_string(o->partition_designator),
6183                                 strempty(o->options));
6184                 fprintf(f, "\n");
6185         }
6186
6187         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6188 }
6189
6190 bool exec_context_maintains_privileges(const ExecContext *c) {
6191         assert(c);
6192
6193         /* Returns true if the process forked off would run under
6194          * an unchanged UID or as root. */
6195
6196         if (!c->user)
6197                 return true;
6198
6199         if (streq(c->user, "root") || streq(c->user, "0"))
6200                 return true;
6201
6202         return false;
6203 }
6204
6205 int exec_context_get_effective_ioprio(const ExecContext *c) {
6206         int p;
6207
6208         assert(c);
6209
6210         if (c->ioprio_set)
6211                 return c->ioprio;
6212
6213         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6214         if (p < 0)
6215                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6216
6217         return ioprio_normalize(p);
6218 }
6219
6220 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6221         assert(c);
6222
6223         /* Explicit setting wins */
6224         if (c->mount_apivfs_set)
6225                 return c->mount_apivfs;
6226
6227         /* Default to "yes" if root directory or image are specified */
6228         if (exec_context_with_rootfs(c))
6229                 return true;
6230
6231         return false;
6232 }
6233
6234 void exec_context_free_log_extra_fields(ExecContext *c) {
6235         assert(c);
6236
6237         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6238                 free(c->log_extra_fields[l].iov_base);
6239         c->log_extra_fields = mfree(c->log_extra_fields);
6240         c->n_log_extra_fields = 0;
6241 }
6242
6243 void exec_context_revert_tty(ExecContext *c) {
6244         _cleanup_close_ int fd = -EBADF;
6245         const char *path;
6246         struct stat st;
6247         int r;
6248
6249         assert(c);
6250
6251         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6252         exec_context_tty_reset(c, NULL);
6253
6254         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6255          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6256          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6257         if (!exec_context_may_touch_tty(c))
6258                 return;
6259
6260         path = exec_context_tty_path(c);
6261         if (!path)
6262                 return;
6263
6264         fd = open(path, O_PATH|O_CLOEXEC);
6265         if (fd < 0)
6266                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6267                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6268                                              path);
6269
6270         if (fstat(fd, &st) < 0)
6271                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6272
6273         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6274          * if things are a character device, since a proper check either means we'd have to open the TTY and
6275          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6276          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6277          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6278         if (!S_ISCHR(st.st_mode))
6279                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6280
6281         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6282         if (r < 0)
6283                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6284 }
6285
6286 int exec_context_get_clean_directories(
6287                 ExecContext *c,
6288                 char **prefix,
6289                 ExecCleanMask mask,
6290                 char ***ret) {
6291
6292         _cleanup_strv_free_ char **l = NULL;
6293         int r;
6294
6295         assert(c);
6296         assert(prefix);
6297         assert(ret);
6298
6299         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6300                 if (!FLAGS_SET(mask, 1U << t))
6301                         continue;
6302
6303                 if (!prefix[t])
6304                         continue;
6305
6306                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6307                         char *j;
6308
6309                         j = path_join(prefix[t], c->directories[t].items[i].path);
6310                         if (!j)
6311                                 return -ENOMEM;
6312
6313                         r = strv_consume(&l, j);
6314                         if (r < 0)
6315                                 return r;
6316
6317                         /* Also remove private directories unconditionally. */
6318                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6319                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6320                                 if (!j)
6321                                         return -ENOMEM;
6322
6323                                 r = strv_consume(&l, j);
6324                                 if (r < 0)
6325                                         return r;
6326                         }
6327
6328                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6329                                 j = path_join(prefix[t], *symlink);
6330                                 if (!j)
6331                                         return -ENOMEM;
6332
6333                                 r = strv_consume(&l, j);
6334                                 if (r < 0)
6335                                         return r;
6336                         }
6337                 }
6338         }
6339
6340         *ret = TAKE_PTR(l);
6341         return 0;
6342 }
6343
6344 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6345         ExecCleanMask mask = 0;
6346
6347         assert(c);
6348         assert(ret);
6349
6350         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6351                 if (c->directories[t].n_items > 0)
6352                         mask |= 1U << t;
6353
6354         *ret = mask;
6355         return 0;
6356 }
6357
6358 void exec_status_start(ExecStatus *s, pid_t pid) {
6359         assert(s);
6360
6361         *s = (ExecStatus) {
6362                 .pid = pid,
6363         };
6364
6365         dual_timestamp_get(&s->start_timestamp);
6366 }
6367
6368 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6369         assert(s);
6370
6371         if (s->pid != pid)
6372                 *s = (ExecStatus) {
6373                         .pid = pid,
6374                 };
6375
6376         dual_timestamp_get(&s->exit_timestamp);
6377
6378         s->code = code;
6379         s->status = status;
6380
6381         if (context && context->utmp_id)
6382                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6383 }
6384
6385 void exec_status_reset(ExecStatus *s) {
6386         assert(s);
6387
6388         *s = (ExecStatus) {};
6389 }
6390
6391 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6392         assert(s);
6393         assert(f);
6394
6395         if (s->pid <= 0)
6396                 return;
6397
6398         prefix = strempty(prefix);
6399
6400         fprintf(f,
6401                 "%sPID: "PID_FMT"\n",
6402                 prefix, s->pid);
6403
6404         if (dual_timestamp_is_set(&s->start_timestamp))
6405                 fprintf(f,
6406                         "%sStart Timestamp: %s\n",
6407                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6408
6409         if (dual_timestamp_is_set(&s->exit_timestamp))
6410                 fprintf(f,
6411                         "%sExit Timestamp: %s\n"
6412                         "%sExit Code: %s\n"
6413                         "%sExit Status: %i\n",
6414                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6415                         prefix, sigchld_code_to_string(s->code),
6416                         prefix, s->status);
6417 }
6418
6419 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6420         _cleanup_free_ char *cmd = NULL;
6421         const char *prefix2;
6422
6423         assert(c);
6424         assert(f);
6425
6426         prefix = strempty(prefix);
6427         prefix2 = strjoina(prefix, "\t");
6428
6429         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6430
6431         fprintf(f,
6432                 "%sCommand Line: %s\n",
6433                 prefix, strnull(cmd));
6434
6435         exec_status_dump(&c->exec_status, f, prefix2);
6436 }
6437
6438 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6439         assert(f);
6440
6441         prefix = strempty(prefix);
6442
6443         LIST_FOREACH(command, i, c)
6444                 exec_command_dump(i, f, prefix);
6445 }
6446
6447 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6448         ExecCommand *end;
6449
6450         assert(l);
6451         assert(e);
6452
6453         if (*l) {
6454                 /* It's kind of important, that we keep the order here */
6455                 end = LIST_FIND_TAIL(command, *l);
6456                 LIST_INSERT_AFTER(command, *l, end, e);
6457         } else
6458                 *l = e;
6459 }
6460
6461 int exec_command_set(ExecCommand *c, const char *path, ...) {
6462         va_list ap;
6463         char **l, *p;
6464
6465         assert(c);
6466         assert(path);
6467
6468         va_start(ap, path);
6469         l = strv_new_ap(path, ap);
6470         va_end(ap);
6471
6472         if (!l)
6473                 return -ENOMEM;
6474
6475         p = strdup(path);
6476         if (!p) {
6477                 strv_free(l);
6478                 return -ENOMEM;
6479         }
6480
6481         free_and_replace(c->path, p);
6482
6483         return strv_free_and_replace(c->argv, l);
6484 }
6485
6486 int exec_command_append(ExecCommand *c, const char *path, ...) {
6487         _cleanup_strv_free_ char **l = NULL;
6488         va_list ap;
6489         int r;
6490
6491         assert(c);
6492         assert(path);
6493
6494         va_start(ap, path);
6495         l = strv_new_ap(path, ap);
6496         va_end(ap);
6497
6498         if (!l)
6499                 return -ENOMEM;
6500
6501         r = strv_extend_strv(&c->argv, l, false);
6502         if (r < 0)
6503                 return r;
6504
6505         return 0;
6506 }
6507
6508 static char *destroy_tree(char *path) {
6509         if (!path)
6510                 return NULL;
6511
6512         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6513                 log_debug("Spawning process to nuke '%s'", path);
6514
6515                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6516         }
6517
6518         return mfree(path);
6519 }
6520
6521 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6522         if (!rt)
6523                 return NULL;
6524
6525         if (rt->manager)
6526                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6527
6528         rt->id = mfree(rt->id);
6529         rt->tmp_dir = mfree(rt->tmp_dir);
6530         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6531         safe_close_pair(rt->netns_storage_socket);
6532         safe_close_pair(rt->ipcns_storage_socket);
6533         return mfree(rt);
6534 }
6535
6536 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6537 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6538
6539 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6540         if (!rt)
6541                 return NULL;
6542
6543         assert(rt->n_ref > 0);
6544         rt->n_ref--;
6545
6546         if (rt->n_ref > 0)
6547                 return NULL;
6548
6549         rt->tmp_dir = destroy_tree(rt->tmp_dir);
6550         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6551
6552         return exec_shared_runtime_free(rt);
6553 }
6554
6555 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6556         _cleanup_free_ char *id_copy = NULL;
6557         ExecSharedRuntime *n;
6558
6559         assert(ret);
6560
6561         id_copy = strdup(id);
6562         if (!id_copy)
6563                 return -ENOMEM;
6564
6565         n = new(ExecSharedRuntime, 1);
6566         if (!n)
6567                 return -ENOMEM;
6568
6569         *n = (ExecSharedRuntime) {
6570                 .id = TAKE_PTR(id_copy),
6571                 .netns_storage_socket = PIPE_EBADF,
6572                 .ipcns_storage_socket = PIPE_EBADF,
6573         };
6574
6575         *ret = n;
6576         return 0;
6577 }
6578
6579 static int exec_shared_runtime_add(
6580                 Manager *m,
6581                 const char *id,
6582                 char **tmp_dir,
6583                 char **var_tmp_dir,
6584                 int netns_storage_socket[2],
6585                 int ipcns_storage_socket[2],
6586                 ExecSharedRuntime **ret) {
6587
6588         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6589         int r;
6590
6591         assert(m);
6592         assert(id);
6593
6594         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6595
6596         r = exec_shared_runtime_allocate(&rt, id);
6597         if (r < 0)
6598                 return r;
6599
6600         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6601         if (r < 0)
6602                 return r;
6603
6604         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6605         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6606         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6607
6608         if (netns_storage_socket) {
6609                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6610                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6611         }
6612
6613         if (ipcns_storage_socket) {
6614                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6615                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6616         }
6617
6618         rt->manager = m;
6619
6620         if (ret)
6621                 *ret = rt;
6622         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6623         TAKE_PTR(rt);
6624         return 0;
6625 }
6626
6627 static int exec_shared_runtime_make(
6628                 Manager *m,
6629                 const ExecContext *c,
6630                 const char *id,
6631                 ExecSharedRuntime **ret) {
6632
6633         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6634         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6635         int r;
6636
6637         assert(m);
6638         assert(c);
6639         assert(id);
6640
6641         /* It is not necessary to create ExecSharedRuntime object. */
6642         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6643                 *ret = NULL;
6644                 return 0;
6645         }
6646
6647         if (c->private_tmp &&
6648             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6649               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6650                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6651                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6652                 if (r < 0)
6653                         return r;
6654         }
6655
6656         if (exec_needs_network_namespace(c)) {
6657                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6658                         return -errno;
6659         }
6660
6661         if (exec_needs_ipc_namespace(c)) {
6662                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6663                         return -errno;
6664         }
6665
6666         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6667         if (r < 0)
6668                 return r;
6669
6670         return 1;
6671 }
6672
6673 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6674         ExecSharedRuntime *rt;
6675         int r;
6676
6677         assert(m);
6678         assert(id);
6679         assert(ret);
6680
6681         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6682         if (rt)
6683                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6684                 goto ref;
6685
6686         if (!create) {
6687                 *ret = NULL;
6688                 return 0;
6689         }
6690
6691         /* If not found, then create a new object. */
6692         r = exec_shared_runtime_make(m, c, id, &rt);
6693         if (r < 0)
6694                 return r;
6695         if (r == 0) {
6696                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6697                 *ret = NULL;
6698                 return 0;
6699         }
6700
6701 ref:
6702         /* increment reference counter. */
6703         rt->n_ref++;
6704         *ret = rt;
6705         return 1;
6706 }
6707
6708 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6709         ExecSharedRuntime *rt;
6710
6711         assert(m);
6712         assert(f);
6713         assert(fds);
6714
6715         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6716                 fprintf(f, "exec-runtime=%s", rt->id);
6717
6718                 if (rt->tmp_dir)
6719                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6720
6721                 if (rt->var_tmp_dir)
6722                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6723
6724                 if (rt->netns_storage_socket[0] >= 0) {
6725                         int copy;
6726
6727                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6728                         if (copy < 0)
6729                                 return copy;
6730
6731                         fprintf(f, " netns-socket-0=%i", copy);
6732                 }
6733
6734                 if (rt->netns_storage_socket[1] >= 0) {
6735                         int copy;
6736
6737                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6738                         if (copy < 0)
6739                                 return copy;
6740
6741                         fprintf(f, " netns-socket-1=%i", copy);
6742                 }
6743
6744                 if (rt->ipcns_storage_socket[0] >= 0) {
6745                         int copy;
6746
6747                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6748                         if (copy < 0)
6749                                 return copy;
6750
6751                         fprintf(f, " ipcns-socket-0=%i", copy);
6752                 }
6753
6754                 if (rt->ipcns_storage_socket[1] >= 0) {
6755                         int copy;
6756
6757                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6758                         if (copy < 0)
6759                                 return copy;
6760
6761                         fprintf(f, " ipcns-socket-1=%i", copy);
6762                 }
6763
6764                 fputc('\n', f);
6765         }
6766
6767         return 0;
6768 }
6769
6770 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6771         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6772         ExecSharedRuntime *rt;
6773         int r;
6774
6775         /* This is for the migration from old (v237 or earlier) deserialization text.
6776          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6777          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6778          * so or not from the serialized text, then we always creates a new object owned by this. */
6779
6780         assert(u);
6781         assert(key);
6782         assert(value);
6783
6784         /* Manager manages ExecSharedRuntime objects by the unit id.
6785          * So, we omit the serialized text when the unit does not have id (yet?)... */
6786         if (isempty(u->id)) {
6787                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6788                 return 0;
6789         }
6790
6791         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6792                 return log_oom();
6793
6794         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6795         if (!rt) {
6796                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6797                         return log_oom();
6798
6799                 rt = rt_create;
6800         }
6801
6802         if (streq(key, "tmp-dir")) {
6803                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6804                         return -ENOMEM;
6805
6806         } else if (streq(key, "var-tmp-dir")) {
6807                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6808                         return -ENOMEM;
6809
6810         } else if (streq(key, "netns-socket-0")) {
6811                 int fd;
6812
6813                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6814                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6815                         return 0;
6816                 }
6817
6818                 safe_close(rt->netns_storage_socket[0]);
6819                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6820
6821         } else if (streq(key, "netns-socket-1")) {
6822                 int fd;
6823
6824                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6825                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6826                         return 0;
6827                 }
6828
6829                 safe_close(rt->netns_storage_socket[1]);
6830                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6831
6832         } else
6833                 return 0;
6834
6835         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6836         if (rt_create) {
6837                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6838                 if (r < 0) {
6839                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6840                         return 0;
6841                 }
6842
6843                 rt_create->manager = u->manager;
6844
6845                 /* Avoid cleanup */
6846                 TAKE_PTR(rt_create);
6847         }
6848
6849         return 1;
6850 }
6851
6852 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6853         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6854         char *id = NULL;
6855         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6856         const char *p, *v = ASSERT_PTR(value);
6857         size_t n;
6858
6859         assert(m);
6860         assert(fds);
6861
6862         n = strcspn(v, " ");
6863         id = strndupa_safe(v, n);
6864         if (v[n] != ' ')
6865                 goto finalize;
6866         p = v + n + 1;
6867
6868         v = startswith(p, "tmp-dir=");
6869         if (v) {
6870                 n = strcspn(v, " ");
6871                 tmp_dir = strndup(v, n);
6872                 if (!tmp_dir)
6873                         return log_oom();
6874                 if (v[n] != ' ')
6875                         goto finalize;
6876                 p = v + n + 1;
6877         }
6878
6879         v = startswith(p, "var-tmp-dir=");
6880         if (v) {
6881                 n = strcspn(v, " ");
6882                 var_tmp_dir = strndup(v, n);
6883                 if (!var_tmp_dir)
6884                         return log_oom();
6885                 if (v[n] != ' ')
6886                         goto finalize;
6887                 p = v + n + 1;
6888         }
6889
6890         v = startswith(p, "netns-socket-0=");
6891         if (v) {
6892                 char *buf;
6893
6894                 n = strcspn(v, " ");
6895                 buf = strndupa_safe(v, n);
6896
6897                 netns_fdpair[0] = parse_fd(buf);
6898                 if (netns_fdpair[0] < 0)
6899                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6900                 if (!fdset_contains(fds, netns_fdpair[0]))
6901                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6902                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6903                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6904                 if (v[n] != ' ')
6905                         goto finalize;
6906                 p = v + n + 1;
6907         }
6908
6909         v = startswith(p, "netns-socket-1=");
6910         if (v) {
6911                 char *buf;
6912
6913                 n = strcspn(v, " ");
6914                 buf = strndupa_safe(v, n);
6915
6916                 netns_fdpair[1] = parse_fd(buf);
6917                 if (netns_fdpair[1] < 0)
6918                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6919                 if (!fdset_contains(fds, netns_fdpair[1]))
6920                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6921                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6922                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6923                 if (v[n] != ' ')
6924                         goto finalize;
6925                 p = v + n + 1;
6926         }
6927
6928         v = startswith(p, "ipcns-socket-0=");
6929         if (v) {
6930                 char *buf;
6931
6932                 n = strcspn(v, " ");
6933                 buf = strndupa_safe(v, n);
6934
6935                 ipcns_fdpair[0] = parse_fd(buf);
6936                 if (ipcns_fdpair[0] < 0)
6937                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6938                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6939                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6940                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6941                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6942                 if (v[n] != ' ')
6943                         goto finalize;
6944                 p = v + n + 1;
6945         }
6946
6947         v = startswith(p, "ipcns-socket-1=");
6948         if (v) {
6949                 char *buf;
6950
6951                 n = strcspn(v, " ");
6952                 buf = strndupa_safe(v, n);
6953
6954                 ipcns_fdpair[1] = parse_fd(buf);
6955                 if (ipcns_fdpair[1] < 0)
6956                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6957                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6958                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6959                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6960                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6961         }
6962
6963 finalize:
6964         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6965         if (r < 0)
6966                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6967         return 0;
6968 }
6969
6970 void exec_shared_runtime_vacuum(Manager *m) {
6971         ExecSharedRuntime *rt;
6972
6973         assert(m);
6974
6975         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6976
6977         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6978                 if (rt->n_ref > 0)
6979                         continue;
6980
6981                 (void) exec_shared_runtime_free(rt);
6982         }
6983 }
6984
6985 int exec_runtime_make(
6986                 const Unit *unit,
6987                 const ExecContext *context,
6988                 ExecSharedRuntime *shared,
6989                 DynamicCreds *creds,
6990                 ExecRuntime **ret) {
6991         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6992         _cleanup_free_ char *ephemeral = NULL;
6993         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6994         int r;
6995
6996         assert(unit);
6997         assert(context);
6998         assert(ret);
6999
7000         if (!shared && !creds && !exec_needs_ephemeral(context)) {
7001                 *ret = NULL;
7002                 return 0;
7003         }
7004
7005         if (exec_needs_ephemeral(context)) {
7006                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7007                 if (r < 0)
7008                         return r;
7009
7010                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7011                 if (r < 0)
7012                         return r;
7013
7014                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7015                         return -errno;
7016         }
7017
7018         rt = new(ExecRuntime, 1);
7019         if (!rt)
7020                 return -ENOMEM;
7021
7022         *rt = (ExecRuntime) {
7023                 .shared = shared,
7024                 .dynamic_creds = creds,
7025                 .ephemeral_copy = TAKE_PTR(ephemeral),
7026                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7027                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7028         };
7029
7030         *ret = TAKE_PTR(rt);
7031         return 1;
7032 }
7033
7034 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7035         if (!rt)
7036                 return NULL;
7037
7038         exec_shared_runtime_unref(rt->shared);
7039         dynamic_creds_unref(rt->dynamic_creds);
7040
7041         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7042
7043         safe_close_pair(rt->ephemeral_storage_socket);
7044         return mfree(rt);
7045 }
7046
7047 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7048         if (!rt)
7049                 return NULL;
7050
7051         rt->shared = exec_shared_runtime_destroy(rt->shared);
7052         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7053         return exec_runtime_free(rt);
7054 }
7055
7056 void exec_params_clear(ExecParameters *p) {
7057         if (!p)
7058                 return;
7059
7060         p->environment = strv_free(p->environment);
7061         p->fd_names = strv_free(p->fd_names);
7062         p->fds = mfree(p->fds);
7063         p->exec_fd = safe_close(p->exec_fd);
7064 }
7065
7066 void exec_directory_done(ExecDirectory *d) {
7067         if (!d)
7068                 return;
7069
7070         for (size_t i = 0; i < d->n_items; i++) {
7071                 free(d->items[i].path);
7072                 strv_free(d->items[i].symlinks);
7073         }
7074
7075         d->items = mfree(d->items);
7076         d->n_items = 0;
7077         d->mode = 0755;
7078 }
7079
7080 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7081         assert(d);
7082         assert(path);
7083
7084         for (size_t i = 0; i < d->n_items; i++)
7085                 if (path_equal(d->items[i].path, path))
7086                         return &d->items[i];
7087
7088         return NULL;
7089 }
7090
7091 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7092         _cleanup_strv_free_ char **s = NULL;
7093         _cleanup_free_ char *p = NULL;
7094         ExecDirectoryItem *existing;
7095         int r;
7096
7097         assert(d);
7098         assert(path);
7099
7100         existing = exec_directory_find(d, path);
7101         if (existing) {
7102                 r = strv_extend(&existing->symlinks, symlink);
7103                 if (r < 0)
7104                         return r;
7105
7106                 return 0; /* existing item is updated */
7107         }
7108
7109         p = strdup(path);
7110         if (!p)
7111                 return -ENOMEM;
7112
7113         if (symlink) {
7114                 s = strv_new(symlink);
7115                 if (!s)
7116                         return -ENOMEM;
7117         }
7118
7119         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7120                 return -ENOMEM;
7121
7122         d->items[d->n_items++] = (ExecDirectoryItem) {
7123                 .path = TAKE_PTR(p),
7124                 .symlinks = TAKE_PTR(s),
7125         };
7126
7127         return 1; /* new item is added */
7128 }
7129
7130 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7131         assert(a);
7132         assert(b);
7133
7134         return path_compare(a->path, b->path);
7135 }
7136
7137 void exec_directory_sort(ExecDirectory *d) {
7138         assert(d);
7139
7140         /* Sort the exec directories to make always parent directories processed at first in
7141          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7142          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7143          * list. See also comments in setup_exec_directory() and issue #24783. */
7144
7145         if (d->n_items <= 1)
7146                 return;
7147
7148         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7149
7150         for (size_t i = 1; i < d->n_items; i++)
7151                 for (size_t j = 0; j < i; j++)
7152                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7153                                 d->items[i].only_create = true;
7154                                 break;
7155                         }
7156 }
7157
7158 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7159         ExecDirectoryType t;
7160
7161         assert(s);
7162
7163         if (streq(s, "all"))
7164                 return EXEC_CLEAN_ALL;
7165         if (streq(s, "fdstore"))
7166                 return EXEC_CLEAN_FDSTORE;
7167
7168         t = exec_resource_type_from_string(s);
7169         if (t < 0)
7170                 return (ExecCleanMask) t;
7171
7172         return 1U << t;
7173 }
7174
/* String names of the ExecInput values, indexed by enum value. NOTE(review): the macro below
 * presumably generates exec_input_to_string() and exec_input_from_string() from this table —
 * confirm against its definition. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7187
/* String names of the ExecOutput values, indexed by enum value. Note that the three file
 * variants are spelled "file", "append" and "truncate". */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7204
/* String names of the ExecUtmpMode values, indexed by enum value. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7212
/* String names of the ExecPreserveMode values, indexed by enum value. NOTE(review): the
 * _WITH_BOOLEAN variant of the lookup macro presumably makes the from-string direction accept
 * boolean spellings as well, with a true value mapping to EXEC_PRESERVE_YES — confirm against the
 * macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7220
/* This table maps ExecDirectoryType to the setting it is configured with in the unit. It is the
 * first of several tables in this file that are indexed by ExecDirectoryType and differ only in
 * the strings they provide. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7231
/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit.
 * Each name is the corresponding directory setting name with "Symlink" appended. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7242
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file.
 *
 * The exec_resource_type_from_string() lookup generated from this table is what
 * exec_clean_mask_from_string() uses to resolve names other than "all" and "fdstore". */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7255
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
 * the service payload in. Unlike for the tables above, only a file-private to-string lookup is
 * generated, i.e. there is no way to map a variable name back to a directory type. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7267
/* String names of the ExecKeyringMode values, indexed by enum value. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);