src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/file.h>
   8 #include <sys/ioctl.h>
   9 #include <sys/mman.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_APPARMOR
  29 #include <sys/apparmor.h>
  30 #endif
  31
  32 #include "sd-messages.h"
  33
  34 #include "af-list.h"
  35 #include "alloc-util.h"
  36 #if HAVE_APPARMOR
  37 #include "apparmor-util.h"
  38 #endif
  39 #include "argv-util.h"
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "bpf-lsm.h"
  43 #include "btrfs-util.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "chattr-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase.h"
  49 #include "chown-recursive.h"
  50 #include "constants.h"
  51 #include "cpu-set-util.h"
  52 #include "data-fd-util.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "escape.h"
  57 #include "exec-credential.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "format-util.h"
  62 #include "glob-util.h"
  63 #include "hexdecoct.h"
  64 #include "io-util.h"
  65 #include "ioprio-util.h"
  66 #include "lock-util.h"
  67 #include "log.h"
  68 #include "macro.h"
  69 #include "manager.h"
  70 #include "manager-dump.h"
  71 #include "memory-util.h"
  72 #include "missing_fs.h"
  73 #include "missing_ioprio.h"
  74 #include "missing_prctl.h"
  75 #include "mkdir-label.h"
  76 #include "namespace.h"
  77 #include "parse-util.h"
  78 #include "path-util.h"
  79 #include "proc-cmdline.h"
  80 #include "process-util.h"
  81 #include "psi-util.h"
  82 #include "rlimit-util.h"
  83 #include "rm-rf.h"
  84 #include "seccomp-util.h"
  85 #include "securebits-util.h"
  86 #include "selinux-util.h"
  87 #include "signal-util.h"
  88 #include "smack-util.h"
  89 #include "socket-util.h"
  90 #include "sort-util.h"
  91 #include "special.h"
  92 #include "stat-util.h"
  93 #include "string-table.h"
  94 #include "string-util.h"
  95 #include "strv.h"
  96 #include "syslog-util.h"
  97 #include "terminal-util.h"
  98 #include "tmpfile-util.h"
  99 #include "umask-util.h"
 100 #include "unit-serialize.h"
 101 #include "user-util.h"
 102 #include "utmp-wtmp.h"
 103
 104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 106
 107 #define SNDBUF_SIZE (8*1024*1024)
 108
 109 static int shift_fds(int fds[], size_t n_fds) {
 110         if (n_fds <= 0)
 111                 return 0;
 112
 113         /* Modifies the fds array! (sorts it) */
 114
 115         assert(fds);
 116
 117         for (int start = 0;;) {
 118                 int restart_from = -1;
 119
 120                 for (int i = start; i < (int) n_fds; i++) {
 121                         int nfd;
 122
 123                         /* Already at right index? */
 124                         if (fds[i] == i+3)
 125                                 continue;
 126
 127                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 128                         if (nfd < 0)
 129                                 return -errno;
 130
 131                         safe_close(fds[i]);
 132                         fds[i] = nfd;
 133
 134                         /* Hmm, the fd we wanted isn't free? Then
 135                          * let's remember that and try again from here */
 136                         if (nfd != i+3 && restart_from < 0)
 137                                 restart_from = i;
 138                 }
 139
 140                 if (restart_from < 0)
 141                         break;
 142
 143                 start = restart_from;
 144         }
 145
 146         return 0;
 147 }
 148
 149 static int flags_fds(
 150                 const int fds[],
 151                 size_t n_socket_fds,
 152                 size_t n_fds,
 153                 bool nonblock) {
 154
 155         int r;
 156
 157         if (n_fds <= 0)
 158                 return 0;
 159
 160         assert(fds);
 161
 162         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 163          * O_NONBLOCK only applies to socket activation though. */
 164
 165         for (size_t i = 0; i < n_fds; i++) {
 166
 167                 if (i < n_socket_fds) {
 168                         r = fd_nonblock(fds[i], nonblock);
 169                         if (r < 0)
 170                                 return r;
 171                 }
 172
 173                 /* We unconditionally drop FD_CLOEXEC from the fds,
 174                  * since after all we want to pass these fds to our
 175                  * children */
 176
 177                 r = fd_cloexec(fds[i], false);
 178                 if (r < 0)
 179                         return r;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static const char *exec_context_tty_path(const ExecContext *context) {
 186         assert(context);
 187
 188         if (context->stdio_as_fds)
 189                 return NULL;
 190
 191         if (context->tty_path)
 192                 return context->tty_path;
 193
 194         return "/dev/console";
 195 }
 196
 197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 198         unsigned rows, cols;
 199         const char *tty;
 200
 201         assert(context);
 202         assert(ret_rows);
 203         assert(ret_cols);
 204
 205         rows = context->tty_rows;
 206         cols = context->tty_cols;
 207
 208         tty = exec_context_tty_path(context);
 209         if (tty)
 210                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 211
 212         *ret_rows = rows;
 213         *ret_cols = cols;
 214
 215         return 0;
 216 }
 217
 218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 219         _cleanup_close_ int fd = -EBADF;
 220         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 221
 222         /* Take a lock around the device for the duration of the setup that we do here.
 223          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 224          * We open a new fd that will be closed automatically, and operate on it for convenience.
 225          */
 226
 227         if (p && p->stdin_fd >= 0) {
 228                 fd = xopenat_lock(p->stdin_fd, NULL,
 229                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 230                 if (fd < 0)
 231                         return;
 232         } else if (path) {
 233                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 234                 if (fd < 0)
 235                         return;
 236
 237                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 238                         return;
 239         } else
 240                 return;   /* nothing to do */
 241
 242         if (context->tty_vhangup)
 243                 (void) terminal_vhangup_fd(fd);
 244
 245         if (context->tty_reset)
 246                 (void) reset_terminal_fd(fd, true);
 247
 248         if (p && p->stdin_fd >= 0) {
 249                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 250
 251                 (void) exec_context_tty_size(context, &rows, &cols);
 252                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 253         }
 254
 255         if (context->tty_vt_disallocate && path)
 256                 (void) vt_disallocate(path);
 257 }
 258
 259 static bool is_terminal_input(ExecInput i) {
 260         return IN_SET(i,
 261                       EXEC_INPUT_TTY,
 262                       EXEC_INPUT_TTY_FORCE,
 263                       EXEC_INPUT_TTY_FAIL);
 264 }
 265
 266 static bool is_terminal_output(ExecOutput o) {
 267         return IN_SET(o,
 268                       EXEC_OUTPUT_TTY,
 269                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 270                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 271 }
 272
 273 static bool is_kmsg_output(ExecOutput o) {
 274         return IN_SET(o,
 275                       EXEC_OUTPUT_KMSG,
 276                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 277 }
 278
 279 static bool exec_context_needs_term(const ExecContext *c) {
 280         assert(c);
 281
 282         /* Return true if the execution context suggests we should set $TERM to something useful. */
 283
 284         if (is_terminal_input(c->std_input))
 285                 return true;
 286
 287         if (is_terminal_output(c->std_output))
 288                 return true;
 289
 290         if (is_terminal_output(c->std_error))
 291                 return true;
 292
 293         return !!c->tty_path;
 294 }
 295
 296 static int open_null_as(int flags, int nfd) {
 297         int fd;
 298
 299         assert(nfd >= 0);
 300
 301         fd = open("/dev/null", flags|O_NOCTTY);
 302         if (fd < 0)
 303                 return -errno;
 304
 305         return move_fd(fd, nfd, false);
 306 }
 307
 308 static int connect_journal_socket(
 309                 int fd,
 310                 const char *log_namespace,
 311                 uid_t uid,
 312                 gid_t gid) {
 313
 314         uid_t olduid = UID_INVALID;
 315         gid_t oldgid = GID_INVALID;
 316         const char *j;
 317         int r;
 318
 319         j = log_namespace ?
 320                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 321                 "/run/systemd/journal/stdout";
 322
 323         if (gid_is_valid(gid)) {
 324                 oldgid = getgid();
 325
 326                 if (setegid(gid) < 0)
 327                         return -errno;
 328         }
 329
 330         if (uid_is_valid(uid)) {
 331                 olduid = getuid();
 332
 333                 if (seteuid(uid) < 0) {
 334                         r = -errno;
 335                         goto restore_gid;
 336                 }
 337         }
 338
 339         r = connect_unix_path(fd, AT_FDCWD, j);
 340
 341         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 342            an LSM interferes. */
 343
 344         if (uid_is_valid(uid))
 345                 (void) seteuid(olduid);
 346
 347  restore_gid:
 348         if (gid_is_valid(gid))
 349                 (void) setegid(oldgid);
 350
 351         return r;
 352 }
 353
 354 static int connect_logger_as(
 355                 const Unit *unit,
 356                 const ExecContext *context,
 357                 const ExecParameters *params,
 358                 ExecOutput output,
 359                 const char *ident,
 360                 int nfd,
 361                 uid_t uid,
 362                 gid_t gid) {
 363
 364         _cleanup_close_ int fd = -EBADF;
 365         int r;
 366
 367         assert(context);
 368         assert(params);
 369         assert(output < _EXEC_OUTPUT_MAX);
 370         assert(ident);
 371         assert(nfd >= 0);
 372
 373         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 374         if (fd < 0)
 375                 return -errno;
 376
 377         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 378         if (r < 0)
 379                 return r;
 380
 381         if (shutdown(fd, SHUT_RD) < 0)
 382                 return -errno;
 383
 384         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 385
 386         if (dprintf(fd,
 387                 "%s\n"
 388                 "%s\n"
 389                 "%i\n"
 390                 "%i\n"
 391                 "%i\n"
 392                 "%i\n"
 393                 "%i\n",
 394                 context->syslog_identifier ?: ident,
 395                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 396                 context->syslog_priority,
 397                 !!context->syslog_level_prefix,
 398                 false,
 399                 is_kmsg_output(output),
 400                 is_terminal_output(output)) < 0)
 401                 return -errno;
 402
 403         return move_fd(TAKE_FD(fd), nfd, false);
 404 }
 405
 406 static int open_terminal_as(const char *path, int flags, int nfd) {
 407         int fd;
 408
 409         assert(path);
 410         assert(nfd >= 0);
 411
 412         fd = open_terminal(path, flags | O_NOCTTY);
 413         if (fd < 0)
 414                 return fd;
 415
 416         return move_fd(fd, nfd, false);
 417 }
 418
 419 static int acquire_path(const char *path, int flags, mode_t mode) {
 420         _cleanup_close_ int fd = -EBADF;
 421         int r;
 422
 423         assert(path);
 424
 425         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 426                 flags |= O_CREAT;
 427
 428         fd = open(path, flags|O_NOCTTY, mode);
 429         if (fd >= 0)
 430                 return TAKE_FD(fd);
 431
 432         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 433                 return -errno;
 434
 435         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 436
 437         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 438         if (fd < 0)
 439                 return -errno;
 440
 441         r = connect_unix_path(fd, AT_FDCWD, path);
 442         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 443                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 444                  * wasn't an AF_UNIX socket after all */
 445                 return -ENXIO;
 446         if (r < 0)
 447                 return r;
 448
 449         if ((flags & O_ACCMODE) == O_RDONLY)
 450                 r = shutdown(fd, SHUT_WR);
 451         else if ((flags & O_ACCMODE) == O_WRONLY)
 452                 r = shutdown(fd, SHUT_RD);
 453         else
 454                 r = 0;
 455         if (r < 0)
 456                 return -errno;
 457
 458         return TAKE_FD(fd);
 459 }
 460
 461 static int fixup_input(
 462                 const ExecContext *context,
 463                 int socket_fd,
 464                 bool apply_tty_stdin) {
 465
 466         ExecInput std_input;
 467
 468         assert(context);
 469
 470         std_input = context->std_input;
 471
 472         if (is_terminal_input(std_input) && !apply_tty_stdin)
 473                 return EXEC_INPUT_NULL;
 474
 475         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 476                 return EXEC_INPUT_NULL;
 477
 478         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 479                 return EXEC_INPUT_NULL;
 480
 481         return std_input;
 482 }
 483
 484 static int fixup_output(ExecOutput output, int socket_fd) {
 485
 486         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 487                 return EXEC_OUTPUT_INHERIT;
 488
 489         return output;
 490 }
 491
 492 static int setup_input(
 493                 const ExecContext *context,
 494                 const ExecParameters *params,
 495                 int socket_fd,
 496                 const int named_iofds[static 3]) {
 497
 498         ExecInput i;
 499         int r;
 500
 501         assert(context);
 502         assert(params);
 503         assert(named_iofds);
 504
 505         if (params->stdin_fd >= 0) {
 506                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 507                         return -errno;
 508
 509                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 510                 if (isatty(STDIN_FILENO)) {
 511                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 512
 513                         (void) exec_context_tty_size(context, &rows, &cols);
 514                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 515                         (void) reset_terminal_fd(STDIN_FILENO, true);
 516                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 517                 }
 518
 519                 return STDIN_FILENO;
 520         }
 521
 522         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 523
 524         switch (i) {
 525
 526         case EXEC_INPUT_NULL:
 527                 return open_null_as(O_RDONLY, STDIN_FILENO);
 528
 529         case EXEC_INPUT_TTY:
 530         case EXEC_INPUT_TTY_FORCE:
 531         case EXEC_INPUT_TTY_FAIL: {
 532                 unsigned rows, cols;
 533                 int fd;
 534
 535                 fd = acquire_terminal(exec_context_tty_path(context),
 536                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 537                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 538                                                                   ACQUIRE_TERMINAL_WAIT,
 539                                       USEC_INFINITY);
 540                 if (fd < 0)
 541                         return fd;
 542
 543                 r = exec_context_tty_size(context, &rows, &cols);
 544                 if (r < 0)
 545                         return r;
 546
 547                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 548                 if (r < 0)
 549                         return r;
 550
 551                 return move_fd(fd, STDIN_FILENO, false);
 552         }
 553
 554         case EXEC_INPUT_SOCKET:
 555                 assert(socket_fd >= 0);
 556
 557                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 558
 559         case EXEC_INPUT_NAMED_FD:
 560                 assert(named_iofds[STDIN_FILENO] >= 0);
 561
 562                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 563                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 564
 565         case EXEC_INPUT_DATA: {
 566                 int fd;
 567
 568                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 569                 if (fd < 0)
 570                         return fd;
 571
 572                 return move_fd(fd, STDIN_FILENO, false);
 573         }
 574
 575         case EXEC_INPUT_FILE: {
 576                 bool rw;
 577                 int fd;
 578
 579                 assert(context->stdio_file[STDIN_FILENO]);
 580
 581                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 582                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 583
 584                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 585                 if (fd < 0)
 586                         return fd;
 587
 588                 return move_fd(fd, STDIN_FILENO, false);
 589         }
 590
 591         default:
 592                 assert_not_reached();
 593         }
 594 }
 595
 596 static bool can_inherit_stderr_from_stdout(
 597                 const ExecContext *context,
 598                 ExecOutput o,
 599                 ExecOutput e) {
 600
 601         assert(context);
 602
 603         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 604          * stderr fd */
 605
 606         if (e == EXEC_OUTPUT_INHERIT)
 607                 return true;
 608         if (e != o)
 609                 return false;
 610
 611         if (e == EXEC_OUTPUT_NAMED_FD)
 612                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 613
 614         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 615                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 616
 617         return true;
 618 }
 619
 620 static int setup_output(
 621                 const Unit *unit,
 622                 const ExecContext *context,
 623                 const ExecParameters *params,
 624                 int fileno,
 625                 int socket_fd,
 626                 const int named_iofds[static 3],
 627                 const char *ident,
 628                 uid_t uid,
 629                 gid_t gid,
 630                 dev_t *journal_stream_dev,
 631                 ino_t *journal_stream_ino) {
 632
 633         ExecOutput o;
 634         ExecInput i;
 635         int r;
 636
 637         assert(unit);
 638         assert(context);
 639         assert(params);
 640         assert(ident);
 641         assert(journal_stream_dev);
 642         assert(journal_stream_ino);
 643
 644         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 645
 646                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 647                         return -errno;
 648
 649                 return STDOUT_FILENO;
 650         }
 651
 652         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 653                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 654                         return -errno;
 655
 656                 return STDERR_FILENO;
 657         }
 658
 659         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 660         o = fixup_output(context->std_output, socket_fd);
 661
 662         if (fileno == STDERR_FILENO) {
 663                 ExecOutput e;
 664                 e = fixup_output(context->std_error, socket_fd);
 665
 666                 /* This expects the input and output are already set up */
 667
 668                 /* Don't change the stderr file descriptor if we inherit all
 669                  * the way and are not on a tty */
 670                 if (e == EXEC_OUTPUT_INHERIT &&
 671                     o == EXEC_OUTPUT_INHERIT &&
 672                     i == EXEC_INPUT_NULL &&
 673                     !is_terminal_input(context->std_input) &&
 674                     getppid() != 1)
 675                         return fileno;
 676
 677                 /* Duplicate from stdout if possible */
 678                 if (can_inherit_stderr_from_stdout(context, o, e))
 679                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 680
 681                 o = e;
 682
 683         } else if (o == EXEC_OUTPUT_INHERIT) {
 684                 /* If input got downgraded, inherit the original value */
 685                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 686                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 687
 688                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 689                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 690                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 691
 692                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 693                 if (getppid() != 1)
 694                         return fileno;
 695
 696                 /* We need to open /dev/null here anew, to get the right access mode. */
 697                 return open_null_as(O_WRONLY, fileno);
 698         }
 699
 700         switch (o) {
 701
 702         case EXEC_OUTPUT_NULL:
 703                 return open_null_as(O_WRONLY, fileno);
 704
 705         case EXEC_OUTPUT_TTY:
 706                 if (is_terminal_input(i))
 707                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 708
 709                 /* We don't reset the terminal if this is just about output */
 710                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 711
 712         case EXEC_OUTPUT_KMSG:
 713         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 714         case EXEC_OUTPUT_JOURNAL:
 715         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 716                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 717                 if (r < 0) {
 718                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 719                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 720                         r = open_null_as(O_WRONLY, fileno);
 721                 } else {
 722                         struct stat st;
 723
 724                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 725                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 726                          * services to detect whether they are connected to the journal or not.
 727                          *
 728                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 729                          * about STDERR as that's usually the best way to do logging. */
 730
 731                         if (fstat(fileno, &st) >= 0 &&
 732                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 733                                 *journal_stream_dev = st.st_dev;
 734                                 *journal_stream_ino = st.st_ino;
 735                         }
 736                 }
 737                 return r;
 738
 739         case EXEC_OUTPUT_SOCKET:
 740                 assert(socket_fd >= 0);
 741
 742                 return RET_NERRNO(dup2(socket_fd, fileno));
 743
 744         case EXEC_OUTPUT_NAMED_FD:
 745                 assert(named_iofds[fileno] >= 0);
 746
 747                 (void) fd_nonblock(named_iofds[fileno], false);
 748                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 749
 750         case EXEC_OUTPUT_FILE:
 751         case EXEC_OUTPUT_FILE_APPEND:
 752         case EXEC_OUTPUT_FILE_TRUNCATE: {
 753                 bool rw;
 754                 int fd, flags;
 755
 756                 assert(context->stdio_file[fileno]);
 757
 758                 rw = context->std_input == EXEC_INPUT_FILE &&
 759                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 760
 761                 if (rw)
 762                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 763
 764                 flags = O_WRONLY;
 765                 if (o == EXEC_OUTPUT_FILE_APPEND)
 766                         flags |= O_APPEND;
 767                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 768                         flags |= O_TRUNC;
 769
 770                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 771                 if (fd < 0)
 772                         return fd;
 773
 774                 return move_fd(fd, fileno, 0);
 775         }
 776
 777         default:
 778                 assert_not_reached();
 779         }
 780 }
 781
 782 static int chown_terminal(int fd, uid_t uid) {
 783         int r;
 784
 785         assert(fd >= 0);
 786
 787         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 788         if (isatty(fd) < 1) {
 789                 if (IN_SET(errno, EINVAL, ENOTTY))
 790                         return 0; /* not a tty */
 791
 792                 return -errno;
 793         }
 794
 795         /* This might fail. What matters are the results. */
 796         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 797         if (r < 0)
 798                 return r;
 799
 800         return 1;
 801 }
 802
 803 static int setup_confirm_stdio(
 804                 const ExecContext *context,
 805                 const char *vc,
 806                 int *ret_saved_stdin,
 807                 int *ret_saved_stdout) {
 808
 809         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 810         unsigned rows, cols;
 811         int r;
 812
 813         assert(ret_saved_stdin);
 814         assert(ret_saved_stdout);
 815
 816         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 817         if (saved_stdin < 0)
 818                 return -errno;
 819
 820         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 821         if (saved_stdout < 0)
 822                 return -errno;
 823
 824         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 825         if (fd < 0)
 826                 return fd;
 827
 828         r = chown_terminal(fd, getuid());
 829         if (r < 0)
 830                 return r;
 831
 832         r = reset_terminal_fd(fd, true);
 833         if (r < 0)
 834                 return r;
 835
 836         r = exec_context_tty_size(context, &rows, &cols);
 837         if (r < 0)
 838                 return r;
 839
 840         r = terminal_set_size_fd(fd, vc, rows, cols);
 841         if (r < 0)
 842                 return r;
 843
 844         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 845         TAKE_FD(fd);
 846         if (r < 0)
 847                 return r;
 848
 849         *ret_saved_stdin = TAKE_FD(saved_stdin);
 850         *ret_saved_stdout = TAKE_FD(saved_stdout);
 851         return 0;
 852 }
 853
 854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 855         assert(err < 0);
 856
 857         if (err == -ETIMEDOUT)
 858                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 859         else {
 860                 errno = -err;
 861                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 862         }
 863 }
 864
 865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 866         _cleanup_close_ int fd = -EBADF;
 867
 868         assert(vc);
 869
 870         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 871         if (fd < 0)
 872                 return;
 873
 874         write_confirm_error_fd(err, fd, u);
 875 }
 876
 877 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 878         int r = 0;
 879
 880         assert(saved_stdin);
 881         assert(saved_stdout);
 882
 883         release_terminal();
 884
 885         if (*saved_stdin >= 0)
 886                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 887                         r = -errno;
 888
 889         if (*saved_stdout >= 0)
 890                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 891                         r = -errno;
 892
 893         *saved_stdin = safe_close(*saved_stdin);
 894         *saved_stdout = safe_close(*saved_stdout);
 895
 896         return r;
 897 }
 898
 899 enum {
 900         CONFIRM_PRETEND_FAILURE = -1,
 901         CONFIRM_PRETEND_SUCCESS =  0,
 902         CONFIRM_EXECUTE = 1,
 903 };
 904
 905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 906         int saved_stdout = -1, saved_stdin = -1, r;
 907         _cleanup_free_ char *e = NULL;
 908         char c;
 909
 910         /* For any internal errors, assume a positive response. */
 911         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 912         if (r < 0) {
 913                 write_confirm_error(r, vc, u);
 914                 return CONFIRM_EXECUTE;
 915         }
 916
 917         /* confirm_spawn might have been disabled while we were sleeping. */
 918         if (manager_is_confirm_spawn_disabled(u->manager)) {
 919                 r = 1;
 920                 goto restore_stdio;
 921         }
 922
 923         e = ellipsize(cmdline, 60, 100);
 924         if (!e) {
 925                 log_oom();
 926                 r = CONFIRM_EXECUTE;
 927                 goto restore_stdio;
 928         }
 929
 930         for (;;) {
 931                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 932                 if (r < 0) {
 933                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 934                         r = CONFIRM_EXECUTE;
 935                         goto restore_stdio;
 936                 }
 937
 938                 switch (c) {
 939                 case 'c':
 940                         printf("Resuming normal execution.\n");
 941                         manager_disable_confirm_spawn();
 942                         r = 1;
 943                         break;
 944                 case 'D':
 945                         unit_dump(u, stdout, "  ");
 946                         continue; /* ask again */
 947                 case 'f':
 948                         printf("Failing execution.\n");
 949                         r = CONFIRM_PRETEND_FAILURE;
 950                         break;
 951                 case 'h':
 952                         printf("  c - continue, proceed without asking anymore\n"
 953                                "  D - dump, show the state of the unit\n"
 954                                "  f - fail, don't execute the command and pretend it failed\n"
 955                                "  h - help\n"
 956                                "  i - info, show a short summary of the unit\n"
 957                                "  j - jobs, show jobs that are in progress\n"
 958                                "  s - skip, don't execute the command and pretend it succeeded\n"
 959                                "  y - yes, execute the command\n");
 960                         continue; /* ask again */
 961                 case 'i':
 962                         printf("  Description: %s\n"
 963                                "  Unit:        %s\n"
 964                                "  Command:     %s\n",
 965                                u->id, u->description, cmdline);
 966                         continue; /* ask again */
 967                 case 'j':
 968                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 969                         continue; /* ask again */
 970                 case 'n':
 971                         /* 'n' was removed in favor of 'f'. */
 972                         printf("Didn't understand 'n', did you mean 'f'?\n");
 973                         continue; /* ask again */
 974                 case 's':
 975                         printf("Skipping execution.\n");
 976                         r = CONFIRM_PRETEND_SUCCESS;
 977                         break;
 978                 case 'y':
 979                         r = CONFIRM_EXECUTE;
 980                         break;
 981                 default:
 982                         assert_not_reached();
 983                 }
 984                 break;
 985         }
 986
 987 restore_stdio:
 988         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 989         return r;
 990 }
 991
 992 static int get_fixed_user(
 993                 const char *username,
 994                 const char **ret_user,
 995                 uid_t *ret_uid,
 996                 gid_t *ret_gid,
 997                 const char **ret_home,
 998                 const char **ret_shell) {
 999
1000         int r;
1001
1002         assert(username);
1003         assert(ret_user);
1004
1005         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1006          * (i.e. are "/" or "/bin/nologin"). */
1007
1008         r = get_user_creds(&username, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
1009         if (r < 0)
1010                 return r;
1011
1012         *ret_user = username;
1013         return 0;
1014 }
1015
/* Looks up 'groupname' via get_group_creds() (no flags) and stores the group name pointer left
 * behind by that call in *ret_group; the GID goes to *ret_gid.
 *
 * Returns 0 on success, or the negative error from get_group_creds(). */
static int get_fixed_group(
                const char *groupname,
                const char **ret_group,
                gid_t *ret_gid) {

        const char *resolved;
        int r;

        assert(groupname);
        assert(ret_group);

        resolved = groupname;
        r = get_group_creds(&resolved, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        *ret_group = resolved;
        return 0;
}
1033
1034 static int get_supplementary_groups(const ExecContext *c, const char *user,
1035                                     const char *group, gid_t gid,
1036                                     gid_t **supplementary_gids, int *ngids) {
1037         int r, k = 0;
1038         int ngroups_max;
1039         bool keep_groups = false;
1040         gid_t *groups = NULL;
1041         _cleanup_free_ gid_t *l_gids = NULL;
1042
1043         assert(c);
1044
1045         /*
1046          * If user is given, then lookup GID and supplementary groups list.
1047          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1048          * here and as early as possible so we keep the list of supplementary
1049          * groups of the caller.
1050          */
1051         if (user && gid_is_valid(gid) && gid != 0) {
1052                 /* First step, initialize groups from /etc/groups */
1053                 if (initgroups(user, gid) < 0)
1054                         return -errno;
1055
1056                 keep_groups = true;
1057         }
1058
1059         if (strv_isempty(c->supplementary_groups))
1060                 return 0;
1061
1062         /*
1063          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1064          * be positive, otherwise fail.
1065          */
1066         errno = 0;
1067         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1068         if (ngroups_max <= 0)
1069                 return errno_or_else(EOPNOTSUPP);
1070
1071         l_gids = new(gid_t, ngroups_max);
1072         if (!l_gids)
1073                 return -ENOMEM;
1074
1075         if (keep_groups) {
1076                 /*
1077                  * Lookup the list of groups that the user belongs to, we
1078                  * avoid NSS lookups here too for gid=0.
1079                  */
1080                 k = ngroups_max;
1081                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1082                         return -EINVAL;
1083         } else
1084                 k = 0;
1085
1086         STRV_FOREACH(i, c->supplementary_groups) {
1087                 const char *g;
1088
1089                 if (k >= ngroups_max)
1090                         return -E2BIG;
1091
1092                 g = *i;
1093                 r = get_group_creds(&g, l_gids+k, 0);
1094                 if (r < 0)
1095                         return r;
1096
1097                 k++;
1098         }
1099
1100         /*
1101          * Sets ngids to zero to drop all supplementary groups, happens
1102          * when we are under root and SupplementaryGroups= is empty.
1103          */
1104         if (k == 0) {
1105                 *ngids = 0;
1106                 return 0;
1107         }
1108
1109         /* Otherwise get the final list of supplementary groups */
1110         groups = memdup(l_gids, sizeof(gid_t) * k);
1111         if (!groups)
1112                 return -ENOMEM;
1113
1114         *supplementary_gids = groups;
1115         *ngids = k;
1116
1117         groups = NULL;
1118
1119         return 0;
1120 }
1121
/* Applies the group identity: installs the supplementary group list if 'ngids' is positive, then
 * switches real, effective and saved GID to 'gid' if it is valid.
 *
 * Returns 0 on success, negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1140
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value (no
 * PR_SET_SECUREBITS call is made then), or a negative errno if prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before;
        unsigned wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = (((unsigned) before) & ~mask) | bits;
        if (wanted == (unsigned) before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1159
1160 static int enforce_user(
1161                 const ExecContext *context,
1162                 uid_t uid,
1163                 uint64_t capability_ambient_set) {
1164         assert(context);
1165         int r;
1166
1167         if (!uid_is_valid(uid))
1168                 return 0;
1169
1170         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1171          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1172          * case. */
1173
1174         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1175
1176                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1177                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1178                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1179                 if (r < 0)
1180                         return r;
1181         }
1182
1183         /* Second step: actually set the uids */
1184         if (setresuid(uid, uid, uid) < 0)
1185                 return -errno;
1186
1187         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1188          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1189          * outside of this call. */
1190         return 0;
1191 }
1192
1193 #if HAVE_PAM
1194
1195 static int null_conv(
1196                 int num_msg,
1197                 const struct pam_message **msg,
1198                 struct pam_response **resp,
1199                 void *appdata_ptr) {
1200
1201         /* We don't support conversations */
1202
1203         return PAM_CONV_ERR;
1204 }
1205
1206 #endif
1207
/* Opens a PAM session for service 'name' and user 'user' and forks off a helper process
 * ("(sd-pam)") whose job is to tear the session down again.
 *
 * name, user: passed to pam_start().
 * uid, gid:   identity the helper process switches to.
 * tty:        value for PAM_TTY; if NULL the TTY name of stdin is used when one can be determined.
 * env:        exported into the PAM environment; on success replaced by the environment list
 *             obtained from PAM.
 * fds, n_fds: descriptors that are closed in the helper process.
 *
 * On failure a negative errno-style value is returned (-EPERM for PAM-level errors). When built
 * without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through a positive error number in its
                                 * return value; it never returns a negative value and does not set
                                 * errno. 'sig' is only valid when 0 was returned. */
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r == EINTR)
                                        continue;

                                goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1424
1425 static void rename_process_from_path(const char *path) {
1426         _cleanup_free_ char *buf = NULL;
1427         const char *p;
1428
1429         assert(path);
1430
1431         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1432          * /bin/ps */
1433
1434         if (path_extract_filename(path, &buf) < 0) {
1435                 rename_process("(...)");
1436                 return;
1437         }
1438
1439         size_t l = strlen(buf);
1440         if (l > 8) {
1441                 /* The end of the process name is usually more interesting, since the first bit might just be
1442                  * "systemd-" */
1443                 p = buf + l - 8;
1444                 l = 8;
1445         } else
1446                 p = buf;
1447
1448         char process_name[11];
1449         process_name[0] = '(';
1450         memcpy(process_name+1, p, l);
1451         process_name[1+l] = ')';
1452         process_name[1+l+1] = 0;
1453
1454         rename_process(process_name);
1455 }
1456
1457 static bool context_has_address_families(const ExecContext *c) {
1458         assert(c);
1459
1460         return c->address_families_allow_list ||
1461                 !set_isempty(c->address_families);
1462 }
1463
1464 static bool context_has_syscall_filters(const ExecContext *c) {
1465         assert(c);
1466
1467         return c->syscall_allow_list ||
1468                 !hashmap_isempty(c->syscall_filter);
1469 }
1470
1471 static bool context_has_syscall_logs(const ExecContext *c) {
1472         assert(c);
1473
1474         return c->syscall_log_allow_list ||
1475                 !hashmap_isempty(c->syscall_log);
1476 }
1477
1478 static bool context_has_no_new_privileges(const ExecContext *c) {
1479         assert(c);
1480
1481         if (c->no_new_privileges)
1482                 return true;
1483
1484         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1485                 return false;
1486
1487         /* We need NNP if we have any form of seccomp and are unprivileged */
1488         return c->lock_personality ||
1489                 c->memory_deny_write_execute ||
1490                 c->private_devices ||
1491                 c->protect_clock ||
1492                 c->protect_hostname ||
1493                 c->protect_kernel_tunables ||
1494                 c->protect_kernel_modules ||
1495                 c->protect_kernel_logs ||
1496                 context_has_address_families(c) ||
1497                 exec_context_restrict_namespaces_set(c) ||
1498                 c->restrict_realtime ||
1499                 c->restrict_suid_sgid ||
1500                 !set_isempty(c->syscall_archs) ||
1501                 context_has_syscall_filters(c) ||
1502                 context_has_syscall_logs(c);
1503 }
1504
1505 #if HAVE_SECCOMP
1506
1507 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1508
1509         if (is_seccomp_available())
1510                 return false;
1511
1512         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1513         return true;
1514 }
1515
1516 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1517         uint32_t negative_action, default_action, action;
1518         int r;
1519
1520         assert(u);
1521         assert(c);
1522
1523         if (!context_has_syscall_filters(c))
1524                 return 0;
1525
1526         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1527                 return 0;
1528
1529         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1530
1531         if (c->syscall_allow_list) {
1532                 default_action = negative_action;
1533                 action = SCMP_ACT_ALLOW;
1534         } else {
1535                 default_action = SCMP_ACT_ALLOW;
1536                 action = negative_action;
1537         }
1538
1539         if (needs_ambient_hack) {
1540                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1541                 if (r < 0)
1542                         return r;
1543         }
1544
1545         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1546 }
1547
1548 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1549 #ifdef SCMP_ACT_LOG
1550         uint32_t default_action, action;
1551 #endif
1552
1553         assert(u);
1554         assert(c);
1555
1556         if (!context_has_syscall_logs(c))
1557                 return 0;
1558
1559 #ifdef SCMP_ACT_LOG
1560         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1561                 return 0;
1562
1563         if (c->syscall_log_allow_list) {
1564                 /* Log nothing but the ones listed */
1565                 default_action = SCMP_ACT_ALLOW;
1566                 action = SCMP_ACT_LOG;
1567         } else {
1568                 /* Log everything but the ones listed */
1569                 default_action = SCMP_ACT_LOG;
1570                 action = SCMP_ACT_ALLOW;
1571         }
1572
1573         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1574 #else
1575         /* old libseccomp */
1576         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1577         return 0;
1578 #endif
1579 }
1580
1581 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1582         assert(u);
1583         assert(c);
1584
1585         if (set_isempty(c->syscall_archs))
1586                 return 0;
1587
1588         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1589                 return 0;
1590
1591         return seccomp_restrict_archs(c->syscall_archs);
1592 }
1593
1594 static int apply_address_families(const Unit* u, const ExecContext *c) {
1595         assert(u);
1596         assert(c);
1597
1598         if (!context_has_address_families(c))
1599                 return 0;
1600
1601         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1602                 return 0;
1603
1604         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1605 }
1606
1607 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1608         int r;
1609
1610         assert(u);
1611         assert(c);
1612
1613         if (!c->memory_deny_write_execute)
1614                 return 0;
1615
1616         /* use prctl() if kernel supports it (6.3) */
1617         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1618         if (r == 0) {
1619                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1620                 return 0;
1621         }
1622         if (r < 0 && errno != EINVAL)
1623                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1624         /* else use seccomp */
1625         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1626
1627         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1628                 return 0;
1629
1630         return seccomp_memory_deny_write_execute();
1631 }
1632
1633 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1634         assert(u);
1635         assert(c);
1636
1637         if (!c->restrict_realtime)
1638                 return 0;
1639
1640         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1641                 return 0;
1642
1643         return seccomp_restrict_realtime();
1644 }
1645
1646 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1647         assert(u);
1648         assert(c);
1649
1650         if (!c->restrict_suid_sgid)
1651                 return 0;
1652
1653         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1654                 return 0;
1655
1656         return seccomp_restrict_suid_sgid();
1657 }
1658
1659 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1660         assert(u);
1661         assert(c);
1662
1663         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1664          * let's protect even those systems where this is left on in the kernel. */
1665
1666         if (!c->protect_kernel_tunables)
1667                 return 0;
1668
1669         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1670                 return 0;
1671
1672         return seccomp_protect_sysctl();
1673 }
1674
1675 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1676         assert(u);
1677         assert(c);
1678
1679         /* Turn off module syscalls on ProtectKernelModules=yes */
1680
1681         if (!c->protect_kernel_modules)
1682                 return 0;
1683
1684         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1685                 return 0;
1686
1687         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1688 }
1689
1690 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1691         assert(u);
1692         assert(c);
1693
1694         if (!c->protect_kernel_logs)
1695                 return 0;
1696
1697         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1698                 return 0;
1699
1700         return seccomp_protect_syslog();
1701 }
1702
1703 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1704         assert(u);
1705         assert(c);
1706
1707         if (!c->protect_clock)
1708                 return 0;
1709
1710         if (skip_seccomp_unavailable(u, "ProtectClock="))
1711                 return 0;
1712
1713         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1714 }
1715
1716 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1717         assert(u);
1718         assert(c);
1719
1720         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1721
1722         if (!c->private_devices)
1723                 return 0;
1724
1725         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1726                 return 0;
1727
1728         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1729 }
1730
1731 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1732         assert(u);
1733         assert(c);
1734
1735         if (!exec_context_restrict_namespaces_set(c))
1736                 return 0;
1737
1738         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1739                 return 0;
1740
1741         return seccomp_restrict_namespaces(c->restrict_namespaces);
1742 }
1743
1744 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1745         unsigned long personality;
1746         int r;
1747
1748         assert(u);
1749         assert(c);
1750
1751         if (!c->lock_personality)
1752                 return 0;
1753
1754         if (skip_seccomp_unavailable(u, "LockPersonality="))
1755                 return 0;
1756
1757         personality = c->personality;
1758
1759         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1760         if (personality == PERSONALITY_INVALID) {
1761
1762                 r = opinionated_personality(&personality);
1763                 if (r < 0)
1764                         return r;
1765         }
1766
1767         return seccomp_lock_personality(personality);
1768 }
1769
1770 #endif
1771
1772 #if HAVE_LIBBPF
1773 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1774         assert(u);
1775         assert(c);
1776
1777         if (!exec_context_restrict_filesystems_set(c))
1778                 return 0;
1779
1780         if (!u->manager->restrict_fs) {
1781                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1782                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1783                 return 0;
1784         }
1785
1786         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1787 }
1788 #endif
1789
1790 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1791         assert(u);
1792         assert(c);
1793
1794         if (!c->protect_hostname)
1795                 return 0;
1796
1797         if (ns_type_supported(NAMESPACE_UTS)) {
1798                 if (unshare(CLONE_NEWUTS) < 0) {
1799                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1800                                 *ret_exit_status = EXIT_NAMESPACE;
1801                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1802                         }
1803
1804                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1805                 }
1806         } else
1807                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1808
1809 #if HAVE_SECCOMP
1810         int r;
1811
1812         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1813                 return 0;
1814
1815         r = seccomp_protect_hostname();
1816         if (r < 0) {
1817                 *ret_exit_status = EXIT_SECCOMP;
1818                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1819         }
1820 #endif
1821
1822         return 0;
1823 }
1824
1825 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1826         assert(idle_pipe);
1827
1828         idle_pipe[1] = safe_close(idle_pipe[1]);
1829         idle_pipe[2] = safe_close(idle_pipe[2]);
1830
1831         if (idle_pipe[0] >= 0) {
1832                 int r;
1833
1834                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1835
1836                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1837                         ssize_t n;
1838
1839                         /* Signal systemd that we are bored and want to continue. */
1840                         n = write(idle_pipe[3], "x", 1);
1841                         if (n > 0)
1842                                 /* Wait for systemd to react to the signal above. */
1843                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1844                 }
1845
1846                 idle_pipe[0] = safe_close(idle_pipe[0]);
1847
1848         }
1849
1850         idle_pipe[3] = safe_close(idle_pipe[3]);
1851 }
1852
1853 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1854
1855 static int build_environment(
1856                 const Unit *u,
1857                 const ExecContext *c,
1858                 const ExecParameters *p,
1859                 const CGroupContext *cgroup_context,
1860                 size_t n_fds,
1861                 char **fdnames,
1862                 const char *home,
1863                 const char *username,
1864                 const char *shell,
1865                 dev_t journal_stream_dev,
1866                 ino_t journal_stream_ino,
1867                 const char *memory_pressure_path,
1868                 char ***ret) {
1869
1870         _cleanup_strv_free_ char **our_env = NULL;
1871         size_t n_env = 0;
1872         char *x;
1873         int r;
1874
1875         assert(u);
1876         assert(c);
1877         assert(p);
1878         assert(ret);
1879
1880 #define N_ENV_VARS 19
1881         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1882         if (!our_env)
1883                 return -ENOMEM;
1884
1885         if (n_fds > 0) {
1886                 _cleanup_free_ char *joined = NULL;
1887
1888                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1889                         return -ENOMEM;
1890                 our_env[n_env++] = x;
1891
1892                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1893                         return -ENOMEM;
1894                 our_env[n_env++] = x;
1895
1896                 joined = strv_join(fdnames, ":");
1897                 if (!joined)
1898                         return -ENOMEM;
1899
1900                 x = strjoin("LISTEN_FDNAMES=", joined);
1901                 if (!x)
1902                         return -ENOMEM;
1903                 our_env[n_env++] = x;
1904         }
1905
1906         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1907                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1908                         return -ENOMEM;
1909                 our_env[n_env++] = x;
1910
1911                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1912                         return -ENOMEM;
1913                 our_env[n_env++] = x;
1914         }
1915
1916         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1917          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1918          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1919         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1920                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1921                 if (!x)
1922                         return -ENOMEM;
1923                 our_env[n_env++] = x;
1924         }
1925
1926         /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1927          * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1928          * really make much sense since we're not logged in. Hence we conditionalize the three based on
1929          * SetLoginEnvironment= switch. */
1930         if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1931                 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1932                 if (r < 0)
1933                         return log_unit_error_errno(u, r, "Failed to determine user credentials for root: %m");
1934         }
1935
1936         bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
1937
1938         if (username) {
1939                 x = strjoin("USER=", username);
1940                 if (!x)
1941                         return -ENOMEM;
1942                 our_env[n_env++] = x;
1943
1944                 if (set_user_login_env) {
1945                         x = strjoin("LOGNAME=", username);
1946                         if (!x)
1947                                 return -ENOMEM;
1948                         our_env[n_env++] = x;
1949                 }
1950         }
1951
1952         if (home && set_user_login_env) {
1953                 x = strjoin("HOME=", home);
1954                 if (!x)
1955                         return -ENOMEM;
1956
1957                 path_simplify(x + 5);
1958                 our_env[n_env++] = x;
1959         }
1960
1961         if (shell && set_user_login_env) {
1962                 x = strjoin("SHELL=", shell);
1963                 if (!x)
1964                         return -ENOMEM;
1965
1966                 path_simplify(x + 6);
1967                 our_env[n_env++] = x;
1968         }
1969
1970         if (!sd_id128_is_null(u->invocation_id)) {
1971                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1972                         return -ENOMEM;
1973
1974                 our_env[n_env++] = x;
1975         }
1976
1977         if (exec_context_needs_term(c)) {
1978                 _cleanup_free_ char *cmdline = NULL;
1979                 const char *tty_path, *term = NULL;
1980
1981                 tty_path = exec_context_tty_path(c);
1982
1983                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1984                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1985                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1986
1987                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1988                         term = getenv("TERM");
1989                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1990                         _cleanup_free_ char *key = NULL;
1991
1992                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1993                         if (!key)
1994                                 return -ENOMEM;
1995
1996                         r = proc_cmdline_get_key(key, 0, &cmdline);
1997                         if (r < 0)
1998                                 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1999                         else if (r > 0)
2000                                 term = cmdline;
2001                 }
2002
2003                 if (!term)
2004                         term = default_term_for_tty(tty_path);
2005
2006                 x = strjoin("TERM=", term);
2007                 if (!x)
2008                         return -ENOMEM;
2009                 our_env[n_env++] = x;
2010         }
2011
2012         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2013                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2014                         return -ENOMEM;
2015
2016                 our_env[n_env++] = x;
2017         }
2018
2019         if (c->log_namespace) {
2020                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2021                 if (!x)
2022                         return -ENOMEM;
2023
2024                 our_env[n_env++] = x;
2025         }
2026
2027         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2028                 _cleanup_free_ char *joined = NULL;
2029                 const char *n;
2030
2031                 if (!p->prefix[t])
2032                         continue;
2033
2034                 if (c->directories[t].n_items == 0)
2035                         continue;
2036
2037                 n = exec_directory_env_name_to_string(t);
2038                 if (!n)
2039                         continue;
2040
2041                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2042                         _cleanup_free_ char *prefixed = NULL;
2043
2044                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2045                         if (!prefixed)
2046                                 return -ENOMEM;
2047
2048                         if (!strextend_with_separator(&joined, ":", prefixed))
2049                                 return -ENOMEM;
2050                 }
2051
2052                 x = strjoin(n, "=", joined);
2053                 if (!x)
2054                         return -ENOMEM;
2055
2056                 our_env[n_env++] = x;
2057         }
2058
2059         _cleanup_free_ char *creds_dir = NULL;
2060         r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
2061         if (r < 0)
2062                 return r;
2063         if (r > 0) {
2064                 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2065                 if (!x)
2066                         return -ENOMEM;
2067
2068                 our_env[n_env++] = x;
2069         }
2070
2071         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2072                 return -ENOMEM;
2073
2074         our_env[n_env++] = x;
2075
2076         if (memory_pressure_path) {
2077                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2078                 if (!x)
2079                         return -ENOMEM;
2080
2081                 our_env[n_env++] = x;
2082
2083                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2084                         _cleanup_free_ char *b = NULL, *e = NULL;
2085
2086                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2087                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2088                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2089                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2090                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2091                                 return -ENOMEM;
2092
2093                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2094                                 return -ENOMEM;
2095
2096                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2097                         if (!x)
2098                                 return -ENOMEM;
2099
2100                         our_env[n_env++] = x;
2101                 }
2102         }
2103
2104         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2105 #undef N_ENV_VARS
2106
2107         *ret = TAKE_PTR(our_env);
2108
2109         return 0;
2110 }
2111
2112 static int build_pass_environment(const ExecContext *c, char ***ret) {
2113         _cleanup_strv_free_ char **pass_env = NULL;
2114         size_t n_env = 0;
2115
2116         STRV_FOREACH(i, c->pass_environment) {
2117                 _cleanup_free_ char *x = NULL;
2118                 char *v;
2119
2120                 v = getenv(*i);
2121                 if (!v)
2122                         continue;
2123                 x = strjoin(*i, "=", v);
2124                 if (!x)
2125                         return -ENOMEM;
2126
2127                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2128                         return -ENOMEM;
2129
2130                 pass_env[n_env++] = TAKE_PTR(x);
2131                 pass_env[n_env] = NULL;
2132         }
2133
2134         *ret = TAKE_PTR(pass_env);
2135
2136         return 0;
2137 }
2138
2139 bool exec_needs_network_namespace(const ExecContext *context) {
2140         assert(context);
2141
2142         return context->private_network || context->network_namespace_path;
2143 }
2144
2145 static bool exec_needs_ephemeral(const ExecContext *context) {
2146         return (context->root_image || context->root_directory) && context->root_ephemeral;
2147 }
2148
2149 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2150         assert(context);
2151
2152         return context->private_ipc || context->ipc_namespace_path;
2153 }
2154
2155 bool exec_needs_mount_namespace(
2156                 const ExecContext *context,
2157                 const ExecParameters *params,
2158                 const ExecRuntime *runtime) {
2159
2160         assert(context);
2161
2162         if (context->root_image)
2163                 return true;
2164
2165         if (!strv_isempty(context->read_write_paths) ||
2166             !strv_isempty(context->read_only_paths) ||
2167             !strv_isempty(context->inaccessible_paths) ||
2168             !strv_isempty(context->exec_paths) ||
2169             !strv_isempty(context->no_exec_paths))
2170                 return true;
2171
2172         if (context->n_bind_mounts > 0)
2173                 return true;
2174
2175         if (context->n_temporary_filesystems > 0)
2176                 return true;
2177
2178         if (context->n_mount_images > 0)
2179                 return true;
2180
2181         if (context->n_extension_images > 0)
2182                 return true;
2183
2184         if (!strv_isempty(context->extension_directories))
2185                 return true;
2186
2187         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2188                 return true;
2189
2190         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2191                 return true;
2192
2193         if (context->private_devices ||
2194             context->private_mounts > 0 ||
2195             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2196             context->protect_system != PROTECT_SYSTEM_NO ||
2197             context->protect_home != PROTECT_HOME_NO ||
2198             context->protect_kernel_tunables ||
2199             context->protect_kernel_modules ||
2200             context->protect_kernel_logs ||
2201             context->protect_control_groups ||
2202             context->protect_proc != PROTECT_PROC_DEFAULT ||
2203             context->proc_subset != PROC_SUBSET_ALL ||
2204             exec_needs_ipc_namespace(context))
2205                 return true;
2206
2207         if (context->root_directory) {
2208                 if (exec_context_get_effective_mount_apivfs(context))
2209                         return true;
2210
2211                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2212                         if (params && !params->prefix[t])
2213                                 continue;
2214
2215                         if (context->directories[t].n_items > 0)
2216                                 return true;
2217                 }
2218         }
2219
2220         if (context->dynamic_user &&
2221             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2222              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2223              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2224                 return true;
2225
2226         if (context->log_namespace)
2227                 return true;
2228
2229         return false;
2230 }
2231
2232 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2233         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2234         _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2235         _cleanup_close_ int unshare_ready_fd = -EBADF;
2236         _cleanup_(sigkill_waitp) pid_t pid = 0;
2237         uint64_t c = 1;
2238         ssize_t n;
2239         int r;
2240
2241         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2242          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2243          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2244          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2245          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2246          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2247          * continues execution normally.
2248          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2249          * does not need CAP_SETUID to write the single line mapping to itself. */
2250
2251         /* Can only set up multiple mappings with CAP_SETUID. */
2252         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2253                 r = asprintf(&uid_map,
2254                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2255                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2256                              ouid, ouid, uid, uid);
2257         else
2258                 r = asprintf(&uid_map,
2259                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2260                              ouid, ouid);
2261
2262         if (r < 0)
2263                 return -ENOMEM;
2264
2265         /* Can only set up multiple mappings with CAP_SETGID. */
2266         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2267                 r = asprintf(&gid_map,
2268                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2269                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2270                              ogid, ogid, gid, gid);
2271         else
2272                 r = asprintf(&gid_map,
2273                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2274                              ogid, ogid);
2275
2276         if (r < 0)
2277                 return -ENOMEM;
2278
2279         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2280          * namespace. */
2281         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2282         if (unshare_ready_fd < 0)
2283                 return -errno;
2284
2285         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2286          * failed. */
2287         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2288                 return -errno;
2289
2290         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2291         if (r < 0)
2292                 return r;
2293         if (r == 0) {
2294                 _cleanup_close_ int fd = -EBADF;
2295                 const char *a;
2296                 pid_t ppid;
2297
2298                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2299                  * here, after the parent opened its own user namespace. */
2300
2301                 ppid = getppid();
2302                 errno_pipe[0] = safe_close(errno_pipe[0]);
2303
2304                 /* Wait until the parent unshared the user namespace */
2305                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2306                         r = -errno;
2307                         goto child_fail;
2308                 }
2309
2310                 /* Disable the setgroups() system call in the child user namespace, for good. */
2311                 a = procfs_file_alloca(ppid, "setgroups");
2312                 fd = open(a, O_WRONLY|O_CLOEXEC);
2313                 if (fd < 0) {
2314                         if (errno != ENOENT) {
2315                                 r = -errno;
2316                                 goto child_fail;
2317                         }
2318
2319                         /* If the file is missing the kernel is too old, let's continue anyway. */
2320                 } else {
2321                         if (write(fd, "deny\n", 5) < 0) {
2322                                 r = -errno;
2323                                 goto child_fail;
2324                         }
2325
2326                         fd = safe_close(fd);
2327                 }
2328
2329                 /* First write the GID map */
2330                 a = procfs_file_alloca(ppid, "gid_map");
2331                 fd = open(a, O_WRONLY|O_CLOEXEC);
2332                 if (fd < 0) {
2333                         r = -errno;
2334                         goto child_fail;
2335                 }
2336                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2337                         r = -errno;
2338                         goto child_fail;
2339                 }
2340                 fd = safe_close(fd);
2341
2342                 /* The write the UID map */
2343                 a = procfs_file_alloca(ppid, "uid_map");
2344                 fd = open(a, O_WRONLY|O_CLOEXEC);
2345                 if (fd < 0) {
2346                         r = -errno;
2347                         goto child_fail;
2348                 }
2349                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2350                         r = -errno;
2351                         goto child_fail;
2352                 }
2353
2354                 _exit(EXIT_SUCCESS);
2355
2356         child_fail:
2357                 (void) write(errno_pipe[1], &r, sizeof(r));
2358                 _exit(EXIT_FAILURE);
2359         }
2360
2361         errno_pipe[1] = safe_close(errno_pipe[1]);
2362
2363         if (unshare(CLONE_NEWUSER) < 0)
2364                 return -errno;
2365
2366         /* Let the child know that the namespace is ready now */
2367         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2368                 return -errno;
2369
2370         /* Try to read an error code from the child */
2371         n = read(errno_pipe[0], &r, sizeof(r));
2372         if (n < 0)
2373                 return -errno;
2374         if (n == sizeof(r)) { /* an error code was sent to us */
2375                 if (r < 0)
2376                         return r;
2377                 return -EIO;
2378         }
2379         if (n != 0) /* on success we should have read 0 bytes */
2380                 return -EIO;
2381
2382         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2383         if (r < 0)
2384                 return r;
2385         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2386                 return -EIO;
2387
2388         return 0;
2389 }
2390
2391 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2392         assert(context);
2393
2394         if (!context->dynamic_user)
2395                 return false;
2396
2397         if (type == EXEC_DIRECTORY_CONFIGURATION)
2398                 return false;
2399
2400         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2401                 return false;
2402
2403         return true;
2404 }
2405
2406 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2407         _cleanup_free_ char *src_abs = NULL;
2408         int r;
2409
2410         assert(source);
2411
2412         src_abs = path_join(root, source);
2413         if (!src_abs)
2414                 return -ENOMEM;
2415
2416         STRV_FOREACH(dst, symlinks) {
2417                 _cleanup_free_ char *dst_abs = NULL;
2418
2419                 dst_abs = path_join(root, *dst);
2420                 if (!dst_abs)
2421                         return -ENOMEM;
2422
2423                 r = mkdir_parents_label(dst_abs, 0755);
2424                 if (r < 0)
2425                         return r;
2426
2427                 r = symlink_idempotent(src_abs, dst_abs, true);
2428                 if (r < 0)
2429                         return r;
2430         }
2431
2432         return 0;
2433 }
2434
2435 static int setup_exec_directory(
2436                 Unit *u,
2437                 const ExecContext *context,
2438                 const ExecParameters *params,
2439                 uid_t uid,
2440                 gid_t gid,
2441                 ExecDirectoryType type,
2442                 bool needs_mount_namespace,
2443                 int *exit_status) {
2444
2445         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2446                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2447                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2448                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2449                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2450                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2451         };
2452         int r;
2453
2454         assert(context);
2455         assert(params);
2456         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2457         assert(exit_status);
2458
2459         if (!params->prefix[type])
2460                 return 0;
2461
2462         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2463                 if (!uid_is_valid(uid))
2464                         uid = 0;
2465                 if (!gid_is_valid(gid))
2466                         gid = 0;
2467         }
2468
2469         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2470                 _cleanup_free_ char *p = NULL, *pp = NULL;
2471
2472                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2473                 if (!p) {
2474                         r = -ENOMEM;
2475                         goto fail;
2476                 }
2477
2478                 r = mkdir_parents_label(p, 0755);
2479                 if (r < 0)
2480                         goto fail;
2481
2482                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2483
2484                         /* If we are in user mode, and a configuration directory exists but a state directory
2485                          * doesn't exist, then we likely are upgrading from an older systemd version that
2486                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2487                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2488                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2489                          * separated. If a service has both dirs configured but only the configuration dir
2490                          * exists and the state dir does not, we assume we are looking at an update
2491                          * situation. Hence, create a compatibility symlink, so that all expectations are
2492                          * met.
2493                          *
2494                          * (We also do something similar with the log directory, which still doesn't exist in
2495                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2496
2497                         /* this assumes the state dir is always created before the configuration dir */
2498                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2499                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2500
2501                         r = laccess(p, F_OK);
2502                         if (r == -ENOENT) {
2503                                 _cleanup_free_ char *q = NULL;
2504
2505                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2506                                  * under the configuration hierarchy. */
2507
2508                                 if (type == EXEC_DIRECTORY_STATE)
2509                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2510                                 else if (type == EXEC_DIRECTORY_LOGS)
2511                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2512                                 else
2513                                         assert_not_reached();
2514                                 if (!q) {
2515                                         r = -ENOMEM;
2516                                         goto fail;
2517                                 }
2518
2519                                 r = laccess(q, F_OK);
2520                                 if (r >= 0) {
2521                                         /* It does exist! This hence looks like an update. Symlink the
2522                                          * configuration directory into the state directory. */
2523
2524                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2525                                         if (r < 0)
2526                                                 goto fail;
2527
2528                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2529                                         continue;
2530                                 } else if (r != -ENOENT)
2531                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2532
2533                         } else if (r < 0)
2534                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2535                 }
2536
2537                 if (exec_directory_is_private(context, type)) {
2538                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2539                          * case we want to avoid leaving a directory around fully accessible that is owned by
2540                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2541                          * trick used by container managers to prohibit host users to get access to files of
2542                          * the same UID in containers: we place everything inside a directory that has an
2543                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2544                          * for unprivileged host code. We then use fs namespacing to make this directory
2545                          * permeable for the service itself.
2546                          *
2547                          * Specifically: for a service which wants a special directory "foo/" we first create
2548                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2549                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2550                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2551                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2552                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2553                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2554                          * for the service and making sure it only gets access to the dirs it needs but no
2555                          * others. Tricky? Yes, absolutely, but it works!
2556                          *
2557                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2558                          * to be owned by the service itself.
2559                          *
2560                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2561                          * for sharing files or sockets with other services. */
2562
2563                         pp = path_join(params->prefix[type], "private");
2564                         if (!pp) {
2565                                 r = -ENOMEM;
2566                                 goto fail;
2567                         }
2568
2569                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2570                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2571                         if (r < 0)
2572                                 goto fail;
2573
2574                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2575                                 r = -ENOMEM;
2576                                 goto fail;
2577                         }
2578
2579                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2580                         r = mkdir_parents_label(pp, 0755);
2581                         if (r < 0)
2582                                 goto fail;
2583
2584                         if (is_dir(p, false) > 0 &&
2585                             (laccess(pp, F_OK) == -ENOENT)) {
2586
2587                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2588                                  * it over. Most likely the service has been upgraded from one that didn't use
2589                                  * DynamicUser=1, to one that does. */
2590
2591                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2592                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2593                                               exec_directory_type_to_string(type), p, pp);
2594
2595                                 r = RET_NERRNO(rename(p, pp));
2596                                 if (r < 0)
2597                                         goto fail;
2598                         } else {
2599                                 /* Otherwise, create the actual directory for the service */
2600
2601                                 r = mkdir_label(pp, context->directories[type].mode);
2602                                 if (r < 0 && r != -EEXIST)
2603                                         goto fail;
2604                         }
2605
2606                         if (!context->directories[type].items[i].only_create) {
2607                                 /* And link it up from the original place.
2608                                  * Notes
2609                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2610                                  *    the host, and a new one for the child namespace will be created later.
2611                                  * 2) It is not necessary to create this symlink when one of its parent
2612                                  *    directories is specified and already created. E.g.
2613                                  *        StateDirectory=foo foo/bar
2614                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2615                                  *        pp = "/var/lib/private/foo/bar"
2616                                  *        p = "/var/lib/foo/bar"
2617                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2618                                  *    we do not need to create the symlink, but we cannot create the symlink.
2619                                  *    See issue #24783. */
2620                                 r = symlink_idempotent(pp, p, true);
2621                                 if (r < 0)
2622                                         goto fail;
2623                         }
2624
2625                 } else {
2626                         _cleanup_free_ char *target = NULL;
2627
2628                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2629                             readlink_and_make_absolute(p, &target) >= 0) {
2630                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2631
2632                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2633                                  * by DynamicUser=1 (see above)?
2634                                  *
2635                                  * We do this for all directory types except for ConfigurationDirectory=,
2636                                  * since they all support the private/ symlink logic at least in some
2637                                  * configurations, see above. */
2638
2639                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2640                                 if (r < 0)
2641                                         goto fail;
2642
2643                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2644                                 if (!q) {
2645                                         r = -ENOMEM;
2646                                         goto fail;
2647                                 }
2648
2649                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2650                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2651                                 if (r < 0)
2652                                         goto fail;
2653
2654                                 if (path_equal(q_resolved, target_resolved)) {
2655
2656                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2657                                          * but is no longer. Let's move the directory back up. */
2658
2659                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2660                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2661                                                       exec_directory_type_to_string(type), q, p);
2662
2663                                         r = RET_NERRNO(unlink(p));
2664                                         if (r < 0)
2665                                                 goto fail;
2666
2667                                         r = RET_NERRNO(rename(q, p));
2668                                         if (r < 0)
2669                                                 goto fail;
2670                                 }
2671                         }
2672
2673                         r = mkdir_label(p, context->directories[type].mode);
2674                         if (r < 0) {
2675                                 if (r != -EEXIST)
2676                                         goto fail;
2677
2678                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2679                                         struct stat st;
2680
2681                                         /* Don't change the owner/access mode of the configuration directory,
2682                                          * as in the common case it is not written to by a service, and shall
2683                                          * not be writable. */
2684
2685                                         r = RET_NERRNO(stat(p, &st));
2686                                         if (r < 0)
2687                                                 goto fail;
2688
2689                                         /* Still complain if the access mode doesn't match */
2690                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2691                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2692                                                                  "(File system: %o %sMode: %o)",
2693                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2694                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2695
2696                                         continue;
2697                                 }
2698                         }
2699                 }
2700
2701                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2702                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2703                  * current UID/GID ownership.) */
2704                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2705                 if (r < 0)
2706                         goto fail;
2707
2708                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2709                  * available to user code anyway */
2710                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2711                         continue;
2712
2713                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2714                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2715                  * assignments to exist. */
2716                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2717                 if (r < 0)
2718                         goto fail;
2719         }
2720
2721         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2722          * they are set up later, to allow configuring empty var/run/etc. */
2723         if (!needs_mount_namespace)
2724                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2725                         r = create_many_symlinks(params->prefix[type],
2726                                                  context->directories[type].items[i].path,
2727                                                  context->directories[type].items[i].symlinks);
2728                         if (r < 0)
2729                                 goto fail;
2730                 }
2731
2732         return 0;
2733
2734 fail:
2735         *exit_status = exit_status_table[type];
2736         return r;
2737 }
2738
#if ENABLE_SMACK
/* Applies a SMACK process label via mac_smack_apply_pid(), passing 0 as the PID.
 *
 * Label selection:
 *   1. The label configured in the exec context, if any.
 *   2. Otherwise, if the manager has a default label: the SMACK exec label read from the
 *      executable's fd, or the manager default when no such label was obtained.
 *   3. Otherwise nothing is applied.
 *
 * Returns 0 on success (or when nothing had to be done) and a negative error code otherwise. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        label = context->smack_process_label;
        if (!label) {
                /* No per-unit label, so the manager wide default decides whether we do anything at all. */
                if (!manager->defaults.smack_process_label)
                        return 0;

                /* Errors matching ERRNO_IS_XATTR_ABSENT() are tolerated, we then fall back to the
                 * default label below. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = exec_label ?: manager->defaults.smack_process_label;
        }

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2768
2769 static int compile_bind_mounts(
2770                 const ExecContext *context,
2771                 const ExecParameters *params,
2772                 BindMount **ret_bind_mounts,
2773                 size_t *ret_n_bind_mounts,
2774                 char ***ret_empty_directories) {
2775
2776         _cleanup_strv_free_ char **empty_directories = NULL;
2777         BindMount *bind_mounts = NULL;
2778         size_t n, h = 0;
2779         int r;
2780
2781         assert(context);
2782         assert(params);
2783         assert(ret_bind_mounts);
2784         assert(ret_n_bind_mounts);
2785         assert(ret_empty_directories);
2786
2787         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2788
2789         n = context->n_bind_mounts;
2790         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2791                 if (!params->prefix[t])
2792                         continue;
2793
2794                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2795                         n += !context->directories[t].items[i].only_create;
2796         }
2797
2798         if (n <= 0) {
2799                 *ret_bind_mounts = NULL;
2800                 *ret_n_bind_mounts = 0;
2801                 *ret_empty_directories = NULL;
2802                 return 0;
2803         }
2804
2805         bind_mounts = new(BindMount, n);
2806         if (!bind_mounts)
2807                 return -ENOMEM;
2808
2809         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2810                 BindMount *item = context->bind_mounts + i;
2811                 _cleanup_free_ char *s = NULL, *d = NULL;
2812
2813                 s = strdup(item->source);
2814                 if (!s)
2815                         return -ENOMEM;
2816
2817                 d = strdup(item->destination);
2818                 if (!d)
2819                         return -ENOMEM;
2820
2821                 bind_mounts[h++] = (BindMount) {
2822                         .source = TAKE_PTR(s),
2823                         .destination = TAKE_PTR(d),
2824                         .read_only = item->read_only,
2825                         .recursive = item->recursive,
2826                         .ignore_enoent = item->ignore_enoent,
2827                 };
2828         }
2829
2830         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2831                 if (!params->prefix[t])
2832                         continue;
2833
2834                 if (context->directories[t].n_items == 0)
2835                         continue;
2836
2837                 if (exec_directory_is_private(context, t) &&
2838                     !exec_context_with_rootfs(context)) {
2839                         char *private_root;
2840
2841                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2842                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2843                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2844
2845                         private_root = path_join(params->prefix[t], "private");
2846                         if (!private_root)
2847                                 return -ENOMEM;
2848
2849                         r = strv_consume(&empty_directories, private_root);
2850                         if (r < 0)
2851                                 return r;
2852                 }
2853
2854                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2855                         _cleanup_free_ char *s = NULL, *d = NULL;
2856
2857                         /* When one of the parent directories is in the list, we cannot create the symlink
2858                          * for the child directory. See also the comments in setup_exec_directory(). */
2859                         if (context->directories[t].items[i].only_create)
2860                                 continue;
2861
2862                         if (exec_directory_is_private(context, t))
2863                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2864                         else
2865                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2866                         if (!s)
2867                                 return -ENOMEM;
2868
2869                         if (exec_directory_is_private(context, t) &&
2870                             exec_context_with_rootfs(context))
2871                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2872                                  * directory is not created on the root directory. So, let's bind-mount the directory
2873                                  * on the 'non-private' place. */
2874                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2875                         else
2876                                 d = strdup(s);
2877                         if (!d)
2878                                 return -ENOMEM;
2879
2880                         bind_mounts[h++] = (BindMount) {
2881                                 .source = TAKE_PTR(s),
2882                                 .destination = TAKE_PTR(d),
2883                                 .read_only = false,
2884                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2885                                 .recursive = true,
2886                                 .ignore_enoent = false,
2887                         };
2888                 }
2889         }
2890
2891         assert(h == n);
2892
2893         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2894         *ret_n_bind_mounts = n;
2895         *ret_empty_directories = TAKE_PTR(empty_directories);
2896
2897         return (int) n;
2898 }
2899
2900 /* ret_symlinks will contain a list of pairs src:dest that describes
2901  * the symlinks to create later on. For example, the symlinks needed
2902  * to safely give private directories to DynamicUser=1 users. */
2903 static int compile_symlinks(
2904                 const ExecContext *context,
2905                 const ExecParameters *params,
2906                 bool setup_os_release_symlink,
2907                 char ***ret_symlinks) {
2908
2909         _cleanup_strv_free_ char **symlinks = NULL;
2910         int r;
2911
2912         assert(context);
2913         assert(params);
2914         assert(ret_symlinks);
2915
2916         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2917                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2918                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2919
2920                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2921                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2922
2923                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924                                 dst_abs = path_join(params->prefix[dt], *symlink);
2925                                 if (!src_abs || !dst_abs)
2926                                         return -ENOMEM;
2927
2928                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2929                                 if (r < 0)
2930                                         return r;
2931                         }
2932
2933                         if (!exec_directory_is_private(context, dt) ||
2934                             exec_context_with_rootfs(context) ||
2935                             context->directories[dt].items[i].only_create)
2936                                 continue;
2937
2938                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2939                         if (!private_path)
2940                                 return -ENOMEM;
2941
2942                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2943                         if (!path)
2944                                 return -ENOMEM;
2945
2946                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2947                         if (r < 0)
2948                                 return r;
2949                 }
2950         }
2951
2952         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2953          * and readers will never get a half-written version. Note that, while the paths specified here are
2954          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2955          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2956         if (setup_os_release_symlink) {
2957                 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2958                 if (r < 0)
2959                         return r;
2960
2961                 r = strv_extend(&symlinks, "/run/host/os-release");
2962                 if (r < 0)
2963                         return r;
2964         }
2965
2966         *ret_symlinks = TAKE_PTR(symlinks);
2967
2968         return 0;
2969 }
2970
2971 static bool insist_on_sandboxing(
2972                 const ExecContext *context,
2973                 const char *root_dir,
2974                 const char *root_image,
2975                 const BindMount *bind_mounts,
2976                 size_t n_bind_mounts) {
2977
2978         assert(context);
2979         assert(n_bind_mounts == 0 || bind_mounts);
2980
2981         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2982          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2983          * rearrange stuff in a way we cannot ignore gracefully. */
2984
2985         if (context->n_temporary_filesystems > 0)
2986                 return true;
2987
2988         if (root_dir || root_image)
2989                 return true;
2990
2991         if (context->n_mount_images > 0)
2992                 return true;
2993
2994         if (context->dynamic_user)
2995                 return true;
2996
2997         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2998                 return true;
2999
3000         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3001          * essential. */
3002         for (size_t i = 0; i < n_bind_mounts; i++)
3003                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3004                         return true;
3005
3006         if (context->log_namespace)
3007                 return true;
3008
3009         return false;
3010 }
3011
3012 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3013         _cleanup_close_ int fd = -EBADF;
3014         int r;
3015
3016         if (!runtime || !runtime->ephemeral_copy)
3017                 return 0;
3018
3019         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3020         if (r < 0)
3021                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3022
3023         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3024
3025         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3026         if (fd >= 0)
3027                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3028                 return 0;
3029
3030         if (fd != -EAGAIN)
3031                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3032
3033         log_debug("Making ephemeral snapshot of %s to %s",
3034                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3035
3036         if (context->root_image)
3037                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3038                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3039         else
3040                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3041                                               AT_FDCWD, runtime->ephemeral_copy,
3042                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3043                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3044                                               BTRFS_SNAPSHOT_RECURSIVE |
3045                                               BTRFS_SNAPSHOT_LOCK_BSD);
3046         if (fd < 0)
3047                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3048                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3049
3050         if (context->root_image) {
3051                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3052                  * which tends to not perform well in combination with lots of random writes.
3053                  *
3054                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3055                  * copy, but we at least want to make the intention clear.
3056                  */
3057                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3058                 if (r < 0)
3059                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3060         }
3061
3062         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3063         if (r < 0)
3064                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3065
3066         return 1;
3067 }
3068
3069 static int verity_settings_prepare(
3070                 VeritySettings *verity,
3071                 const char *root_image,
3072                 const void *root_hash,
3073                 size_t root_hash_size,
3074                 const char *root_hash_path,
3075                 const void *root_hash_sig,
3076                 size_t root_hash_sig_size,
3077                 const char *root_hash_sig_path,
3078                 const char *verity_data_path) {
3079
3080         int r;
3081
3082         assert(verity);
3083
3084         if (root_hash) {
3085                 void *d;
3086
3087                 d = memdup(root_hash, root_hash_size);
3088                 if (!d)
3089                         return -ENOMEM;
3090
3091                 free_and_replace(verity->root_hash, d);
3092                 verity->root_hash_size = root_hash_size;
3093                 verity->designator = PARTITION_ROOT;
3094         }
3095
3096         if (root_hash_sig) {
3097                 void *d;
3098
3099                 d = memdup(root_hash_sig, root_hash_sig_size);
3100                 if (!d)
3101                         return -ENOMEM;
3102
3103                 free_and_replace(verity->root_hash_sig, d);
3104                 verity->root_hash_sig_size = root_hash_sig_size;
3105                 verity->designator = PARTITION_ROOT;
3106         }
3107
3108         if (verity_data_path) {
3109                 r = free_and_strdup(&verity->data_path, verity_data_path);
3110                 if (r < 0)
3111                         return r;
3112         }
3113
3114         r = verity_settings_load(
3115                         verity,
3116                         root_image,
3117                         root_hash_path,
3118                         root_hash_sig_path);
3119         if (r < 0)
3120                 return log_debug_errno(r, "Failed to load root hash: %m");
3121
3122         return 0;
3123 }
3124
3125 static int apply_mount_namespace(
3126                 const Unit *u,
3127                 ExecCommandFlags command_flags,
3128                 const ExecContext *context,
3129                 const ExecParameters *params,
3130                 ExecRuntime *runtime,
3131                 const char *memory_pressure_path,
3132                 char **error_path) {
3133
3134         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3135         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3136                         **read_write_paths_cleanup = NULL;
3137         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3138                         *extension_dir = NULL, *host_os_release_stage = NULL;
3139         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3140         char **read_write_paths;
3141         bool needs_sandboxing, setup_os_release_symlink;
3142         BindMount *bind_mounts = NULL;
3143         size_t n_bind_mounts = 0;
3144         int r;
3145
3146         assert(context);
3147
3148         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3149
3150         if (params->flags & EXEC_APPLY_CHROOT) {
3151                 r = setup_ephemeral(context, runtime);
3152                 if (r < 0)
3153                         return r;
3154
3155                 if (context->root_image)
3156                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3157                 else
3158                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3159         }
3160
3161         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3162         if (r < 0)
3163                 return r;
3164
3165         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3166          * service will need to write to it in order to start the notifications. */
3167         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3168                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3169                 if (!read_write_paths_cleanup)
3170                         return -ENOMEM;
3171
3172                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3173                 if (r < 0)
3174                         return r;
3175
3176                 read_write_paths = read_write_paths_cleanup;
3177         } else
3178                 read_write_paths = context->read_write_paths;
3179
3180         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3181         if (needs_sandboxing) {
3182                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3183                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3184                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3185
3186                 if (context->private_tmp && runtime && runtime->shared) {
3187                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3188                                 tmp_dir = runtime->shared->tmp_dir;
3189                         else if (runtime->shared->tmp_dir)
3190                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3191
3192                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3193                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3194                         else if (runtime->shared->var_tmp_dir)
3195                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3196                 }
3197         }
3198
3199         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3200         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3201         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3202         if (r < 0)
3203                 return r;
3204
3205         if (context->mount_propagation_flag == MS_SHARED)
3206                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
3208         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3209                 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3210                 if (r < 0)
3211                         return r;
3212         }
3213
3214         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3215                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3216                 if (!propagate_dir)
3217                         return -ENOMEM;
3218
3219                 incoming_dir = strdup("/run/systemd/incoming");
3220                 if (!incoming_dir)
3221                         return -ENOMEM;
3222
3223                 extension_dir = strdup("/run/systemd/unit-extensions");
3224                 if (!extension_dir)
3225                         return -ENOMEM;
3226
3227                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3228                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3229                 if (setup_os_release_symlink) {
3230                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3231                         if (!host_os_release_stage)
3232                                 return -ENOMEM;
3233                 }
3234         } else {
3235                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3236
3237                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3238                         return -ENOMEM;
3239
3240                 if (setup_os_release_symlink) {
3241                         if (asprintf(&host_os_release_stage,
3242                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3243                                      geteuid()) < 0)
3244                                 return -ENOMEM;
3245                 }
3246         }
3247
3248         if (root_image) {
3249                 r = verity_settings_prepare(
3250                         &verity,
3251                         root_image,
3252                         context->root_hash, context->root_hash_size, context->root_hash_path,
3253                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3254                         context->root_verity);
3255                 if (r < 0)
3256                         return r;
3257         }
3258
3259         NamespaceParameters parameters = {
3260                 .runtime_scope = params->runtime_scope,
3261
3262                 .root_directory = root_dir,
3263                 .root_image = root_image,
3264                 .root_image_options = context->root_image_options,
3265                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3266
3267                 .read_write_paths = read_write_paths,
3268                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3269                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3270
3271                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3272                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3273
3274                 .empty_directories = empty_directories,
3275                 .symlinks = symlinks,
3276
3277                 .bind_mounts = bind_mounts,
3278                 .n_bind_mounts = n_bind_mounts,
3279
3280                 .temporary_filesystems = context->temporary_filesystems,
3281                 .n_temporary_filesystems = context->n_temporary_filesystems,
3282
3283                 .mount_images = context->mount_images,
3284                 .n_mount_images = context->n_mount_images,
3285                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3286
3287                 .tmp_dir = tmp_dir,
3288                 .var_tmp_dir = var_tmp_dir,
3289
3290                 .creds_path = creds_path,
3291                 .log_namespace = context->log_namespace,
3292                 .mount_propagation_flag = context->mount_propagation_flag,
3293
3294                 .verity = &verity,
3295
3296                 .extension_images = context->extension_images,
3297                 .n_extension_images = context->n_extension_images,
3298                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3299                 .extension_directories = context->extension_directories,
3300
3301                 .propagate_dir = propagate_dir,
3302                 .incoming_dir = incoming_dir,
3303                 .extension_dir = extension_dir,
3304                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3305                 .host_os_release_stage = host_os_release_stage,
3306
3307                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3308                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3309                  * sandbox inside the mount namespace. */
3310                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3311
3312                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3313                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3314                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3315                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3316                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3317
3318                 .private_dev = needs_sandboxing && context->private_devices,
3319                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3320                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3321
3322                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3323
3324                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3325                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3326
3327                 .protect_home = needs_sandboxing && context->protect_home,
3328                 .protect_system = needs_sandboxing && context->protect_system,
3329                 .protect_proc = needs_sandboxing && context->protect_proc,
3330                 .proc_subset = needs_sandboxing && context->proc_subset,
3331         };
3332
3333         r = setup_namespace(&parameters, error_path);
3334         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3335          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3336          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3337          * completely different execution environment. */
3338         if (r == -ENOANO) {
3339                 if (insist_on_sandboxing(
3340                                     context,
3341                                     root_dir, root_image,
3342                                     bind_mounts,
3343                                     n_bind_mounts))
3344                         return log_unit_debug_errno(u,
3345                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3346                                                     "Failed to set up namespace, and refusing to continue since "
3347                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3348                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3349                                                     n_bind_mounts,
3350                                                     context->n_temporary_filesystems,
3351                                                     yes_no(root_dir),
3352                                                     yes_no(root_image),
3353                                                     yes_no(context->dynamic_user));
3354
3355                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3356                 return 0;
3357         }
3358
3359         return r;
3360 }
3361
3362 static int apply_working_directory(
3363                 const ExecContext *context,
3364                 const ExecParameters *params,
3365                 ExecRuntime *runtime,
3366                 const char *home,
3367                 int *exit_status) {
3368
3369         const char *d, *wd;
3370
3371         assert(context);
3372         assert(exit_status);
3373
3374         if (context->working_directory_home) {
3375
3376                 if (!home) {
3377                         *exit_status = EXIT_CHDIR;
3378                         return -ENXIO;
3379                 }
3380
3381                 wd = home;
3382
3383         } else
3384                 wd = empty_to_root(context->working_directory);
3385
3386         if (params->flags & EXEC_APPLY_CHROOT)
3387                 d = wd;
3388         else
3389                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3390
3391         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3392                 *exit_status = EXIT_CHDIR;
3393                 return -errno;
3394         }
3395
3396         return 0;
3397 }
3398
3399 static int apply_root_directory(
3400                 const ExecContext *context,
3401                 const ExecParameters *params,
3402                 ExecRuntime *runtime,
3403                 const bool needs_mount_ns,
3404                 int *exit_status) {
3405
3406         assert(context);
3407         assert(exit_status);
3408
3409         if (params->flags & EXEC_APPLY_CHROOT)
3410                 if (!needs_mount_ns && context->root_directory)
3411                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3412                                 *exit_status = EXIT_CHROOT;
3413                                 return -errno;
3414                         }
3415
3416         return 0;
3417 }
3418
3419 static int setup_keyring(
3420                 const Unit *u,
3421                 const ExecContext *context,
3422                 const ExecParameters *p,
3423                 uid_t uid, gid_t gid) {
3424
3425         key_serial_t keyring;
3426         int r = 0;
3427         uid_t saved_uid;
3428         gid_t saved_gid;
3429
3430         assert(u);
3431         assert(context);
3432         assert(p);
3433
3434         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3435          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3436          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3437          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3438          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3439          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3440
3441         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3442                 return 0;
3443
3444         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3445          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3446          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3447          * & group is just as nasty as acquiring a reference to the user keyring. */
3448
3449         saved_uid = getuid();
3450         saved_gid = getgid();
3451
3452         if (gid_is_valid(gid) && gid != saved_gid) {
3453                 if (setregid(gid, -1) < 0)
3454                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3455         }
3456
3457         if (uid_is_valid(uid) && uid != saved_uid) {
3458                 if (setreuid(uid, -1) < 0) {
3459                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3460                         goto out;
3461                 }
3462         }
3463
3464         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3465         if (keyring == -1) {
3466                 if (errno == ENOSYS)
3467                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3468                 else if (ERRNO_IS_PRIVILEGE(errno))
3469                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3470                 else if (errno == EDQUOT)
3471                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3472                 else
3473                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3474
3475                 goto out;
3476         }
3477
3478         /* When requested link the user keyring into the session keyring. */
3479         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3480
3481                 if (keyctl(KEYCTL_LINK,
3482                            KEY_SPEC_USER_KEYRING,
3483                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3484                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3485                         goto out;
3486                 }
3487         }
3488
3489         /* Restore uid/gid back */
3490         if (uid_is_valid(uid) && uid != saved_uid) {
3491                 if (setreuid(saved_uid, -1) < 0) {
3492                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3493                         goto out;
3494                 }
3495         }
3496
3497         if (gid_is_valid(gid) && gid != saved_gid) {
3498                 if (setregid(saved_gid, -1) < 0)
3499                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3500         }
3501
3502         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3503         if (!sd_id128_is_null(u->invocation_id)) {
3504                 key_serial_t key;
3505
3506                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3507                 if (key == -1)
3508                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3509                 else {
3510                         if (keyctl(KEYCTL_SETPERM, key,
3511                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3512                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3513                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3514                 }
3515         }
3516
3517 out:
3518         /* Revert back uid & gid for the last time, and exit */
3519         /* no extra logging, as only the first already reported error matters */
3520         if (getuid() != saved_uid)
3521                 (void) setreuid(saved_uid, -1);
3522
3523         if (getgid() != saved_gid)
3524                 (void) setregid(saved_gid, -1);
3525
3526         return r;
3527 }
3528
/* Appends each non-negative fd of the two-element 'pair' to 'array' at index *n, incrementing *n for
 * every fd stored. The caller has to make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                *n += 1;
        }
}
3539
3540 static int close_remaining_fds(
3541                 const ExecParameters *params,
3542                 const ExecRuntime *runtime,
3543                 int user_lookup_fd,
3544                 int socket_fd,
3545                 const int *fds, size_t n_fds) {
3546
3547         size_t n_dont_close = 0;
3548         int dont_close[n_fds + 14];
3549
3550         assert(params);
3551
3552         if (params->stdin_fd >= 0)
3553                 dont_close[n_dont_close++] = params->stdin_fd;
3554         if (params->stdout_fd >= 0)
3555                 dont_close[n_dont_close++] = params->stdout_fd;
3556         if (params->stderr_fd >= 0)
3557                 dont_close[n_dont_close++] = params->stderr_fd;
3558
3559         if (socket_fd >= 0)
3560                 dont_close[n_dont_close++] = socket_fd;
3561         if (n_fds > 0) {
3562                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3563                 n_dont_close += n_fds;
3564         }
3565
3566         if (runtime)
3567                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3568
3569         if (runtime && runtime->shared) {
3570                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3571                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3572         }
3573
3574         if (runtime && runtime->dynamic_creds) {
3575                 if (runtime->dynamic_creds->user)
3576                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3577                 if (runtime->dynamic_creds->group)
3578                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3579         }
3580
3581         if (user_lookup_fd >= 0)
3582                 dont_close[n_dont_close++] = user_lookup_fd;
3583
3584         return close_all_fds(dont_close, n_dont_close);
3585 }
3586
3587 static int send_user_lookup(
3588                 Unit *unit,
3589                 int user_lookup_fd,
3590                 uid_t uid,
3591                 gid_t gid) {
3592
3593         assert(unit);
3594
3595         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3596          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3597          * specified. */
3598
3599         if (user_lookup_fd < 0)
3600                 return 0;
3601
3602         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3603                 return 0;
3604
3605         if (writev(user_lookup_fd,
3606                (struct iovec[]) {
3607                            IOVEC_MAKE(&uid, sizeof(uid)),
3608                            IOVEC_MAKE(&gid, sizeof(gid)),
3609                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3610                 return -errno;
3611
3612         return 0;
3613 }
3614
3615 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3616         int r;
3617
3618         assert(c);
3619         assert(home);
3620         assert(buf);
3621
3622         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3623
3624         if (*home)
3625                 return 0;
3626
3627         if (!c->working_directory_home)
3628                 return 0;
3629
3630         r = get_home_dir(buf);
3631         if (r < 0)
3632                 return r;
3633
3634         *home = *buf;
3635         return 1;
3636 }
3637
3638 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3639         _cleanup_strv_free_ char ** list = NULL;
3640         int r;
3641
3642         assert(c);
3643         assert(p);
3644         assert(ret);
3645
3646         assert(c->dynamic_user);
3647
3648         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3649          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3650          * directories. */
3651
3652         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3653                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3654                         continue;
3655
3656                 if (!p->prefix[t])
3657                         continue;
3658
3659                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3660                         char *e;
3661
3662                         if (exec_directory_is_private(c, t))
3663                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3664                         else
3665                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3666                         if (!e)
3667                                 return -ENOMEM;
3668
3669                         r = strv_consume(&list, e);
3670                         if (r < 0)
3671                                 return r;
3672                 }
3673         }
3674
3675         *ret = TAKE_PTR(list);
3676
3677         return 0;
3678 }
3679
3680 static int exec_parameters_get_cgroup_path(
3681                 const ExecParameters *params,
3682                 const CGroupContext *c,
3683                 char **ret) {
3684
3685         const char *subgroup = NULL;
3686         char *p;
3687
3688         assert(params);
3689         assert(ret);
3690
3691         if (!params->cgroup_path)
3692                 return -EINVAL;
3693
3694         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3695          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3696          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3697          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3698          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3699          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3700          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3701          * flag, which is only passed for the former statements, not for the latter. */
3702
3703         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3704                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3705                         subgroup = ".control";
3706                 else
3707                         subgroup = c->delegate_subgroup;
3708         }
3709
3710         if (subgroup)
3711                 p = path_join(params->cgroup_path, subgroup);
3712         else
3713                 p = strdup(params->cgroup_path);
3714         if (!p)
3715                 return -ENOMEM;
3716
3717         *ret = p;
3718         return !!subgroup;
3719 }
3720
3721 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3722         _cleanup_(cpu_set_reset) CPUSet s = {};
3723         int r;
3724
3725         assert(c);
3726         assert(ret);
3727
3728         if (!c->numa_policy.nodes.set) {
3729                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3730                 return 0;
3731         }
3732
3733         r = numa_to_cpu_set(&c->numa_policy, &s);
3734         if (r < 0)
3735                 return r;
3736
3737         cpu_set_reset(ret);
3738
3739         return cpu_set_add_all(ret, &s);
3740 }
3741
3742 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3743         assert(c);
3744
3745         return c->cpu_affinity_from_numa;
3746 }
3747
3748 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3749         int r;
3750
3751         assert(fds);
3752         assert(n_fds);
3753         assert(*n_fds < fds_size);
3754         assert(ret_fd);
3755
3756         if (fd < 0) {
3757                 *ret_fd = -EBADF;
3758                 return 0;
3759         }
3760
3761         if (fd < 3 + (int) *n_fds) {
3762                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3763                  * the fds we pass to the process (or which are closed only during execve). */
3764
3765                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3766                 if (r < 0)
3767                         return -errno;
3768
3769                 close_and_replace(fd, r);
3770         }
3771
3772         *ret_fd = fds[*n_fds] = fd;
3773         (*n_fds) ++;
3774         return 1;
3775 }
3776
3777 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3778         union sockaddr_union addr = {
3779                 .un.sun_family = AF_UNIX,
3780         };
3781         socklen_t sa_len;
3782         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3783         int r;
3784
3785         assert(u);
3786         assert(of);
3787         assert(ofd >= 0);
3788
3789         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3790         if (r < 0)
3791                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3792
3793         sa_len = r;
3794
3795         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3796                 _cleanup_close_ int fd = -EBADF;
3797
3798                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3799                 if (fd < 0)
3800                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3801
3802                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3803                 if (r == -EPROTOTYPE)
3804                         continue;
3805                 if (r < 0)
3806                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3807
3808                 return TAKE_FD(fd);
3809         }
3810
3811         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3812 }
3813
3814 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3815         struct stat st;
3816         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3817
3818         assert(u);
3819         assert(of);
3820
3821         ofd = open(of->path, O_PATH | O_CLOEXEC);
3822         if (ofd < 0)
3823                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3824
3825         if (fstat(ofd, &st) < 0)
3826                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3827
3828         if (S_ISSOCK(st.st_mode)) {
3829                 fd = connect_unix_harder(u, of, ofd);
3830                 if (fd < 0)
3831                         return fd;
3832
3833                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3834                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3835                                                     of->path);
3836
3837                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3838         } else {
3839                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3840                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3841                         flags |= O_APPEND;
3842                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3843                         flags |= O_TRUNC;
3844
3845                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3846                 if (fd < 0)
3847                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3848
3849                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3850         }
3851
3852         return TAKE_FD(fd);
3853 }
3854
3855 static int collect_open_file_fds(
3856                 Unit *u,
3857                 OpenFile* open_files,
3858                 int **fds,
3859                 char ***fdnames,
3860                 size_t *n_fds) {
3861         int r;
3862
3863         assert(u);
3864         assert(fds);
3865         assert(fdnames);
3866         assert(n_fds);
3867
3868         LIST_FOREACH(open_files, of, open_files) {
3869                 _cleanup_close_ int fd = -EBADF;
3870
3871                 fd = get_open_file_fd(u, of);
3872                 if (fd < 0) {
3873                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3874                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3875                                 continue;
3876                         }
3877
3878                         return fd;
3879                 }
3880
3881                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3882                         return -ENOMEM;
3883
3884                 r = strv_extend(fdnames, of->fdname);
3885                 if (r < 0)
3886                         return r;
3887
3888                 (*fds)[*n_fds] = TAKE_FD(fd);
3889
3890                 (*n_fds)++;
3891         }
3892
3893         return 0;
3894 }
3895
3896 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3897         assert(unit);
3898         assert(msg);
3899         assert(executable);
3900
3901         if (!DEBUG_LOGGING)
3902                 return;
3903
3904         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3905
3906         log_unit_struct(unit, LOG_DEBUG,
3907                         "EXECUTABLE=%s", executable,
3908                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3909                         LOG_UNIT_INVOCATION_ID(unit));
3910 }
3911
3912 static bool exec_context_need_unprivileged_private_users(
3913                 const ExecContext *context,
3914                 const ExecParameters *params) {
3915
3916         assert(context);
3917         assert(params);
3918
3919         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3920          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3921          * (system manager) then we have privileges and don't need this. */
3922         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3923                 return false;
3924
3925         return context->private_users ||
3926                context->private_tmp ||
3927                context->private_devices ||
3928                context->private_network ||
3929                context->network_namespace_path ||
3930                context->private_ipc ||
3931                context->ipc_namespace_path ||
3932                context->private_mounts > 0 ||
3933                context->mount_apivfs ||
3934                context->n_bind_mounts > 0 ||
3935                context->n_temporary_filesystems > 0 ||
3936                context->root_directory ||
3937                !strv_isempty(context->extension_directories) ||
3938                context->protect_system != PROTECT_SYSTEM_NO ||
3939                context->protect_home != PROTECT_HOME_NO ||
3940                context->protect_kernel_tunables ||
3941                context->protect_kernel_modules ||
3942                context->protect_kernel_logs ||
3943                context->protect_control_groups ||
3944                context->protect_clock ||
3945                context->protect_hostname ||
3946                !strv_isempty(context->read_write_paths) ||
3947                !strv_isempty(context->read_only_paths) ||
3948                !strv_isempty(context->inaccessible_paths) ||
3949                !strv_isempty(context->exec_paths) ||
3950                !strv_isempty(context->no_exec_paths);
3951 }
3952
3953 static int exec_child(
3954                 Unit *unit,
3955                 const ExecCommand *command,
3956                 const ExecContext *context,
3957                 const ExecParameters *params,
3958                 ExecRuntime *runtime,
3959                 const CGroupContext *cgroup_context,
3960                 int socket_fd,
3961                 const int named_iofds[static 3],
3962                 int *params_fds,
3963                 size_t n_socket_fds,
3964                 size_t n_storage_fds,
3965                 char **files_env,
3966                 int user_lookup_fd,
3967                 int *exit_status) {
3968
3969         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3970         int r, ngids = 0, exec_fd;
3971         _cleanup_free_ gid_t *supplementary_gids = NULL;
3972         const char *username = NULL, *groupname = NULL;
3973         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3974         const char *home = NULL, *shell = NULL;
3975         char **final_argv = NULL;
3976         dev_t journal_stream_dev = 0;
3977         ino_t journal_stream_ino = 0;
3978         bool userns_set_up = false;
3979         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3980                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3981                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3982                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3983 #if HAVE_SELINUX
3984         _cleanup_free_ char *mac_selinux_context_net = NULL;
3985         bool use_selinux = false;
3986 #endif
3987 #if ENABLE_SMACK
3988         bool use_smack = false;
3989 #endif
3990 #if HAVE_APPARMOR
3991         bool use_apparmor = false;
3992 #endif
3993         uid_t saved_uid = getuid();
3994         gid_t saved_gid = getgid();
3995         uid_t uid = UID_INVALID;
3996         gid_t gid = GID_INVALID;
3997         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3998                n_keep_fds; /* total number of fds not to close */
3999         int secure_bits;
4000         _cleanup_free_ gid_t *gids_after_pam = NULL;
4001         int ngids_after_pam = 0;
4002         _cleanup_free_ int *fds = NULL;
4003         _cleanup_strv_free_ char **fdnames = NULL;
4004
4005         assert(unit);
4006         assert(command);
4007         assert(context);
4008         assert(params);
4009         assert(exit_status);
4010
4011         /* Explicitly test for CVE-2021-4034 inspired invocations */
4012         assert(command->path);
4013         assert(!strv_isempty(command->argv));
4014
4015         rename_process_from_path(command->path);
4016
4017         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4018          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4019          * both of which will be demoted to SIG_DFL. */
4020         (void) default_signals(SIGNALS_CRASH_HANDLER,
4021                                SIGNALS_IGNORE);
4022
4023         if (context->ignore_sigpipe)
4024                 (void) ignore_signals(SIGPIPE);
4025
4026         r = reset_signal_mask();
4027         if (r < 0) {
4028                 *exit_status = EXIT_SIGNAL_MASK;
4029                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4030         }
4031
4032         if (params->idle_pipe)
4033                 do_idle_pipe_dance(params->idle_pipe);
4034
4035         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4036          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4037          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4038          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4039
4040         log_forget_fds();
4041         log_set_open_when_needed(true);
4042         log_settle_target();
4043
4044         /* In case anything used libc syslog(), close this here, too */
4045         closelog();
4046
4047         fds = newdup(int, params_fds, n_fds);
4048         if (!fds) {
4049                 *exit_status = EXIT_MEMORY;
4050                 return log_oom();
4051         }
4052
4053         fdnames = strv_copy((char**) params->fd_names);
4054         if (!fdnames) {
4055                 *exit_status = EXIT_MEMORY;
4056                 return log_oom();
4057         }
4058
4059         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4060         if (r < 0) {
4061                 *exit_status = EXIT_FDS;
4062                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4063         }
4064
4065         int keep_fds[n_fds + 3];
4066         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4067         n_keep_fds = n_fds;
4068
4069         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4070         if (r < 0) {
4071                 *exit_status = EXIT_FDS;
4072                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4073         }
4074
4075 #if HAVE_LIBBPF
4076         if (unit->manager->restrict_fs) {
4077                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4078                 if (bpf_map_fd < 0) {
4079                         *exit_status = EXIT_FDS;
4080                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4081                 }
4082
4083                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4084                 if (r < 0) {
4085                         *exit_status = EXIT_FDS;
4086                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4087                 }
4088         }
4089 #endif
4090
4091         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4092         if (r < 0) {
4093                 *exit_status = EXIT_FDS;
4094                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4095         }
4096
4097         if (!context->same_pgrp &&
4098             setsid() < 0) {
4099                 *exit_status = EXIT_SETSID;
4100                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4101         }
4102
4103         exec_context_tty_reset(context, params);
4104
4105         if (unit_shall_confirm_spawn(unit)) {
4106                 _cleanup_free_ char *cmdline = NULL;
4107
4108                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4109                 if (!cmdline) {
4110                         *exit_status = EXIT_MEMORY;
4111                         return log_oom();
4112                 }
4113
4114                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4115                 if (r != CONFIRM_EXECUTE) {
4116                         if (r == CONFIRM_PRETEND_SUCCESS) {
4117                                 *exit_status = EXIT_SUCCESS;
4118                                 return 0;
4119                         }
4120
4121                         *exit_status = EXIT_CONFIRM;
4122                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4123                                                     "Execution cancelled by the user");
4124                 }
4125         }
4126
4127         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4128          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4129          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4130          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4131          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4132         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4133             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4134                 *exit_status = EXIT_MEMORY;
4135                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4136         }
4137
4138         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4139                 _cleanup_strv_free_ char **suggested_paths = NULL;
4140
4141                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4142                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4143                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4144                         *exit_status = EXIT_USER;
4145                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4146                 }
4147
4148                 r = compile_suggested_paths(context, params, &suggested_paths);
4149                 if (r < 0) {
4150                         *exit_status = EXIT_MEMORY;
4151                         return log_oom();
4152                 }
4153
4154                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4155                 if (r < 0) {
4156                         *exit_status = EXIT_USER;
4157                         if (r == -EILSEQ)
4158                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4159                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4160                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4161                 }
4162
4163                 if (!uid_is_valid(uid)) {
4164                         *exit_status = EXIT_USER;
4165                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4166                 }
4167
4168                 if (!gid_is_valid(gid)) {
4169                         *exit_status = EXIT_USER;
4170                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4171                 }
4172
4173                 if (runtime->dynamic_creds->user)
4174                         username = runtime->dynamic_creds->user->name;
4175
4176         } else {
4177                 if (context->user) {
4178                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4179                         if (r < 0) {
4180                                 *exit_status = EXIT_USER;
4181                                 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4182                         }
4183                 }
4184
4185                 if (context->group) {
4186                         r = get_fixed_group(context->group, &groupname, &gid);
4187                         if (r < 0) {
4188                                 *exit_status = EXIT_GROUP;
4189                                 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4190                         }
4191                 }
4192         }
4193
4194         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4195         r = get_supplementary_groups(context, username, groupname, gid,
4196                                      &supplementary_gids, &ngids);
4197         if (r < 0) {
4198                 *exit_status = EXIT_GROUP;
4199                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4200         }
4201
4202         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4203         if (r < 0) {
4204                 *exit_status = EXIT_USER;
4205                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4206         }
4207
4208         user_lookup_fd = safe_close(user_lookup_fd);
4209
4210         r = acquire_home(context, uid, &home, &home_buffer);
4211         if (r < 0) {
4212                 *exit_status = EXIT_CHDIR;
4213                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4214         }
4215
4216         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4217         if (socket_fd >= 0)
4218                 (void) fd_nonblock(socket_fd, false);
4219
4220         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4221          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4222         if (params->cgroup_path) {
4223                 _cleanup_free_ char *p = NULL;
4224
4225                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4226                 if (r < 0) {
4227                         *exit_status = EXIT_CGROUP;
4228                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4229                 }
4230
4231                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4232                 if (r == -EUCLEAN) {
4233                         *exit_status = EXIT_CGROUP;
4234                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4235                                                     "because the cgroup or one of its parents or "
4236                                                     "siblings is in the threaded mode: %m", p);
4237                 }
4238                 if (r < 0) {
4239                         *exit_status = EXIT_CGROUP;
4240                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4241                 }
4242         }
4243
4244         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4245                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4246                 if (r < 0) {
4247                         *exit_status = EXIT_NETWORK;
4248                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4249                 }
4250         }
4251
4252         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4253                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4254                 if (r < 0) {
4255                         *exit_status = EXIT_NAMESPACE;
4256                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4257                 }
4258         }
4259
4260         r = setup_input(context, params, socket_fd, named_iofds);
4261         if (r < 0) {
4262                 *exit_status = EXIT_STDIN;
4263                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4264         }
4265
4266         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4267         if (r < 0) {
4268                 *exit_status = EXIT_STDOUT;
4269                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4270         }
4271
4272         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4273         if (r < 0) {
4274                 *exit_status = EXIT_STDERR;
4275                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4276         }
4277
4278         if (context->oom_score_adjust_set) {
4279                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4280                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4281                 r = set_oom_score_adjust(context->oom_score_adjust);
4282                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4283                         log_unit_debug_errno(unit, r,
4284                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4285                 else if (r < 0) {
4286                         *exit_status = EXIT_OOM_ADJUST;
4287                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4288                 }
4289         }
4290
4291         if (context->coredump_filter_set) {
4292                 r = set_coredump_filter(context->coredump_filter);
4293                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4294                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4295                 else if (r < 0) {
4296                         *exit_status = EXIT_LIMITS;
4297                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4298                 }
4299         }
4300
4301         if (context->nice_set) {
4302                 r = setpriority_closest(context->nice);
4303                 if (r < 0) {
4304                         *exit_status = EXIT_NICE;
4305                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4306                 }
4307         }
4308
4309         if (context->cpu_sched_set) {
4310                 struct sched_param param = {
4311                         .sched_priority = context->cpu_sched_priority,
4312                 };
4313
4314                 r = sched_setscheduler(0,
4315                                        context->cpu_sched_policy |
4316                                        (context->cpu_sched_reset_on_fork ?
4317                                         SCHED_RESET_ON_FORK : 0),
4318                                        &param);
4319                 if (r < 0) {
4320                         *exit_status = EXIT_SETSCHEDULER;
4321                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4322                 }
4323         }
4324
4325         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4326                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4327                 const CPUSet *cpu_set;
4328
4329                 if (context->cpu_affinity_from_numa) {
4330                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4331                         if (r < 0) {
4332                                 *exit_status = EXIT_CPUAFFINITY;
4333                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4334                         }
4335
4336                         cpu_set = &converted_cpu_set;
4337                 } else
4338                         cpu_set = &context->cpu_set;
4339
4340                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4341                         *exit_status = EXIT_CPUAFFINITY;
4342                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4343                 }
4344         }
4345
4346         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4347                 r = apply_numa_policy(&context->numa_policy);
4348                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4349                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4350                 else if (r < 0) {
4351                         *exit_status = EXIT_NUMA_POLICY;
4352                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4353                 }
4354         }
4355
4356         if (context->ioprio_set)
4357                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4358                         *exit_status = EXIT_IOPRIO;
4359                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4360                 }
4361
4362         if (context->timer_slack_nsec != NSEC_INFINITY)
4363                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4364                         *exit_status = EXIT_TIMERSLACK;
4365                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4366                 }
4367
4368         if (context->personality != PERSONALITY_INVALID) {
4369                 r = safe_personality(context->personality);
4370                 if (r < 0) {
4371                         *exit_status = EXIT_PERSONALITY;
4372                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4373                 }
4374         }
4375
4376         if (context->utmp_id) {
4377                 const char *line = context->tty_path ?
4378                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4379                         NULL;
4380                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4381                                       line,
4382                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4383                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4384                                       USER_PROCESS,
4385                                       username);
4386         }
4387
4388         if (uid_is_valid(uid)) {
4389                 r = chown_terminal(STDIN_FILENO, uid);
4390                 if (r < 0) {
4391                         *exit_status = EXIT_STDIN;
4392                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4393                 }
4394         }
4395
4396         if (params->cgroup_path) {
4397                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4398                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4399                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4400                  * touch a single hierarchy too. */
4401
4402                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4403                         _cleanup_free_ char *p = NULL;
4404
4405                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4406                         if (r < 0) {
4407                                 *exit_status = EXIT_CGROUP;
4408                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4409                         }
4410
4411                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4412                         if (r < 0) {
4413                                 *exit_status = EXIT_CGROUP;
4414                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4415                         }
4416                         if (r > 0) {
4417                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4418                                 if (r < 0) {
4419                                         *exit_status = EXIT_CGROUP;
4420                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4421                                 }
4422                         }
4423                 }
4424
4425                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4426                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4427                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4428                                 if (r < 0) {
4429                                         *exit_status = EXIT_MEMORY;
4430                                         return log_oom();
4431                                 }
4432
4433                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4434                                 if (r < 0) {
4435                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4436                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4437                                         memory_pressure_path = mfree(memory_pressure_path);
4438                                 }
4439                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4440                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4441                                 if (!memory_pressure_path) {
4442                                         *exit_status = EXIT_MEMORY;
4443                                         return log_oom();
4444                                 }
4445                         }
4446                 }
4447         }
4448
4449         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4450
4451         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4452                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4453                 if (r < 0)
4454                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4455         }
4456
4457         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4458                 r = exec_setup_credentials(context, params, unit->id, uid, gid);
4459                 if (r < 0) {
4460                         *exit_status = EXIT_CREDENTIALS;
4461                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4462                 }
4463         }
4464
4465         r = build_environment(
4466                         unit,
4467                         context,
4468                         params,
4469                         cgroup_context,
4470                         n_fds,
4471                         fdnames,
4472                         home,
4473                         username,
4474                         shell,
4475                         journal_stream_dev,
4476                         journal_stream_ino,
4477                         memory_pressure_path,
4478                         &our_env);
4479         if (r < 0) {
4480                 *exit_status = EXIT_MEMORY;
4481                 return log_oom();
4482         }
4483
4484         r = build_pass_environment(context, &pass_env);
4485         if (r < 0) {
4486                 *exit_status = EXIT_MEMORY;
4487                 return log_oom();
4488         }
4489
4490         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4491          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4492          * not specify PATH but the unit has ExecSearchPath. */
4493         if (!strv_isempty(context->exec_search_path)) {
4494                 _cleanup_free_ char *joined = NULL;
4495
4496                 joined = strv_join(context->exec_search_path, ":");
4497                 if (!joined) {
4498                         *exit_status = EXIT_MEMORY;
4499                         return log_oom();
4500                 }
4501
4502                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4503                 if (r < 0) {
4504                         *exit_status = EXIT_MEMORY;
4505                         return log_oom();
4506                 }
4507         }
4508
4509         accum_env = strv_env_merge(params->environment,
4510                                    our_env,
4511                                    joined_exec_search_path,
4512                                    pass_env,
4513                                    context->environment,
4514                                    files_env);
4515         if (!accum_env) {
4516                 *exit_status = EXIT_MEMORY;
4517                 return log_oom();
4518         }
4519         accum_env = strv_env_clean(accum_env);
4520
4521         (void) umask(context->umask);
4522
4523         r = setup_keyring(unit, context, params, uid, gid);
4524         if (r < 0) {
4525                 *exit_status = EXIT_KEYRING;
4526                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4527         }
4528
4529         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4530          * from it. */
4531         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4532
4533         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4534          * for it, and the kernel doesn't actually support ambient caps. */
4535         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4536
4537         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4538          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4539          * desired. */
4540         if (needs_ambient_hack)
4541                 needs_setuid = false;
4542         else
4543                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4544
4545         uint64_t capability_ambient_set = context->capability_ambient_set;
4546
4547         if (needs_sandboxing) {
4548                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4549                  * /sys being present. The actual MAC context application will happen later, as late as
4550                  * possible, to avoid impacting our own code paths. */
4551
4552 #if HAVE_SELINUX
4553                 use_selinux = mac_selinux_use();
4554 #endif
4555 #if ENABLE_SMACK
4556                 use_smack = mac_smack_use();
4557 #endif
4558 #if HAVE_APPARMOR
4559                 use_apparmor = mac_apparmor_use();
4560 #endif
4561         }
4562
4563         if (needs_sandboxing) {
4564                 int which_failed;
4565
4566                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4567                  * is set here. (See below.) */
4568
4569                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4570                 if (r < 0) {
4571                         *exit_status = EXIT_LIMITS;
4572                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4573                 }
4574         }
4575
4576         if (needs_setuid && context->pam_name && username) {
4577                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4578                  * wins here. (See above.) */
4579
4580                 /* All fds passed in the fds array will be closed in the pam child process. */
4581                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4582                 if (r < 0) {
4583                         *exit_status = EXIT_PAM;
4584                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4585                 }
4586
4587                 if (ambient_capabilities_supported()) {
4588                         uint64_t ambient_after_pam;
4589
4590                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4591                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4592                         r = capability_get_ambient(&ambient_after_pam);
4593                         if (r < 0) {
4594                                 *exit_status = EXIT_CAPABILITIES;
4595                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4596                         }
4597
4598                         capability_ambient_set |= ambient_after_pam;
4599                 }
4600
4601                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4602                 if (ngids_after_pam < 0) {
4603                         *exit_status = EXIT_MEMORY;
4604                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4605                 }
4606         }
4607
4608         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4609                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4610                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4611                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4612
4613                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4614                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4615                  * the actual requested operations fail (or silently continue). */
4616                 if (r < 0 && context->private_users) {
4617                         *exit_status = EXIT_USER;
4618                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4619                 }
4620                 if (r < 0)
4621                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4622                 else
4623                         userns_set_up = true;
4624         }
4625
4626         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4627
4628                 /* Try to enable network namespacing if network namespacing is available and we have
4629                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4630                  * new network namespace. And if we don't have that, then we could only create a network
4631                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4632                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4633                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4634                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4635                                 log_unit_notice_errno(unit, r,
4636                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4637                         else if (r < 0) {
4638                                 *exit_status = EXIT_NETWORK;
4639                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4640                         }
4641                 } else if (context->network_namespace_path) {
4642                         *exit_status = EXIT_NETWORK;
4643                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4644                                                     "NetworkNamespacePath= is not supported, refusing.");
4645                 } else
4646                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4647         }
4648
4649         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4650
4651                 if (ns_type_supported(NAMESPACE_IPC)) {
4652                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4653                         if (r == -EPERM)
4654                                 log_unit_warning_errno(unit, r,
4655                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4656                         else if (r < 0) {
4657                                 *exit_status = EXIT_NAMESPACE;
4658                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4659                         }
4660                 } else if (context->ipc_namespace_path) {
4661                         *exit_status = EXIT_NAMESPACE;
4662                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4663                                                     "IPCNamespacePath= is not supported, refusing.");
4664                 } else
4665                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4666         }
4667
4668         if (needs_mount_namespace) {
4669                 _cleanup_free_ char *error_path = NULL;
4670
4671                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4672                 if (r < 0) {
4673                         *exit_status = EXIT_NAMESPACE;
4674                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4675                                                     error_path ? ": " : "", strempty(error_path));
4676                 }
4677         }
4678
4679         if (needs_sandboxing) {
4680                 r = apply_protect_hostname(unit, context, exit_status);
4681                 if (r < 0)
4682                         return r;
4683         }
4684
4685         if (context->memory_ksm >= 0)
4686                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4687                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4688                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4689                         else {
4690                                 *exit_status = EXIT_KSM;
4691                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4692                         }
4693                 }
4694
4695         /* Drop groups as early as possible.
4696          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4697          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4698         if (needs_setuid) {
4699                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4700                 int ngids_to_enforce = 0;
4701
4702                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4703                                                    ngids,
4704                                                    gids_after_pam,
4705                                                    ngids_after_pam,
4706                                                    &gids_to_enforce);
4707                 if (ngids_to_enforce < 0) {
4708                         *exit_status = EXIT_MEMORY;
4709                         return log_unit_error_errno(unit,
4710                                                     ngids_to_enforce,
4711                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4712                 }
4713
4714                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4715                 if (r < 0) {
4716                         *exit_status = EXIT_GROUP;
4717                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4718                 }
4719         }
4720
4721         /* If the user namespace was not set up above, try to do it now.
4722          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4723          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4724          * case of mount namespaces being less privileged when the mount point list is copied from a
4725          * different user namespace). */
4726
4727         if (needs_sandboxing && context->private_users && !userns_set_up) {
4728                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4729                 if (r < 0) {
4730                         *exit_status = EXIT_USER;
4731                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4732                 }
4733         }
4734
4735         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4736          * shall execute. */
4737
4738         _cleanup_free_ char *executable = NULL;
4739         _cleanup_close_ int executable_fd = -EBADF;
4740         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4741         if (r < 0) {
4742                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4743                         log_unit_struct_errno(unit, LOG_INFO, r,
4744                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4745                                               LOG_UNIT_INVOCATION_ID(unit),
4746                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4747                                                                command->path),
4748                                               "EXECUTABLE=%s", command->path);
4749                         *exit_status = EXIT_SUCCESS;
4750                         return 0;
4751                 }
4752
4753                 *exit_status = EXIT_EXEC;
4754                 return log_unit_struct_errno(unit, LOG_INFO, r,
4755                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4756                                              LOG_UNIT_INVOCATION_ID(unit),
4757                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4758                                                               command->path),
4759                                              "EXECUTABLE=%s", command->path);
4760         }
4761
4762         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4763         if (r < 0) {
4764                 *exit_status = EXIT_FDS;
4765                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4766         }
4767
4768 #if HAVE_SELINUX
4769         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4770                 int fd = -EBADF;
4771
4772                 if (socket_fd >= 0)
4773                         fd = socket_fd;
4774                 else if (params->n_socket_fds == 1)
4775                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4776                          * use context from that fd to compute the label. */
4777                         fd = params->fds[0];
4778
4779                 if (fd >= 0) {
4780                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4781                         if (r < 0) {
4782                                 if (!context->selinux_context_ignore) {
4783                                         *exit_status = EXIT_SELINUX_CONTEXT;
4784                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4785                                 }
4786                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4787                         }
4788                 }
4789         }
4790 #endif
4791
4792         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4793          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4794          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4795          * execve(). */
4796
4797         r = close_all_fds(keep_fds, n_keep_fds);
4798         if (r >= 0)
4799                 r = shift_fds(fds, n_fds);
4800         if (r >= 0)
4801                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4802         if (r < 0) {
4803                 *exit_status = EXIT_FDS;
4804                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4805         }
4806
4807         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4808          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4809          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4810          * came this far. */
4811
4812         secure_bits = context->secure_bits;
4813
4814         if (needs_sandboxing) {
4815                 uint64_t bset;
4816
4817                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4818                  * (Note this is placed after the general resource limit initialization, see above, in order
4819                  * to take precedence.) */
4820                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4821                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4822                                 *exit_status = EXIT_LIMITS;
4823                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4824                         }
4825                 }
4826
4827 #if ENABLE_SMACK
4828                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4829                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4830                 if (use_smack) {
4831                         r = setup_smack(unit->manager, context, executable_fd);
4832                         if (r < 0 && !context->smack_process_label_ignore) {
4833                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4834                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4835                         }
4836                 }
4837 #endif
4838
4839                 bset = context->capability_bounding_set;
4840                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4841                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4842                  * instead of us doing that */
4843                 if (needs_ambient_hack)
4844                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4845                                 (UINT64_C(1) << CAP_SETUID) |
4846                                 (UINT64_C(1) << CAP_SETGID);
4847
4848                 if (!cap_test_all(bset)) {
4849                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4850                         if (r < 0) {
4851                                 *exit_status = EXIT_CAPABILITIES;
4852                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4853                         }
4854                 }
4855
4856                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4857                  * keep-caps set.
4858                  *
4859                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4860                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4861                  * the ambient capabilities can be raised as they are present in the permitted and
4862                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4863                  * without changing the user, so we also set the ambient capabilities here.
4864                  *
4865                  * The requested ambient capabilities are raised in the inheritable set if the second
4866                  * argument is true. */
4867                 if (!needs_ambient_hack) {
4868                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4869                         if (r < 0) {
4870                                 *exit_status = EXIT_CAPABILITIES;
4871                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4872                         }
4873                 }
4874         }
4875
4876         /* chroot to root directory first, before we lose the ability to chroot */
4877         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4878         if (r < 0)
4879                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4880
4881         if (needs_setuid) {
4882                 if (uid_is_valid(uid)) {
4883                         r = enforce_user(context, uid, capability_ambient_set);
4884                         if (r < 0) {
4885                                 *exit_status = EXIT_USER;
4886                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4887                         }
4888
4889                         if (!needs_ambient_hack && capability_ambient_set != 0) {
4890
4891                                 /* Raise the ambient capabilities after user change. */
4892                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4893                                 if (r < 0) {
4894                                         *exit_status = EXIT_CAPABILITIES;
4895                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4896                                 }
4897                         }
4898                 }
4899         }
4900
4901         /* Apply working directory here, because the working directory might be on NFS and only the user running
4902          * this service might have the correct privilege to change to the working directory */
4903         r = apply_working_directory(context, params, runtime, home, exit_status);
4904         if (r < 0)
4905                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4906
4907         if (needs_sandboxing) {
4908                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4909                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4910                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4911                  * are restricted. */
4912
4913 #if HAVE_SELINUX
4914                 if (use_selinux) {
4915                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4916
4917                         if (exec_context) {
4918                                 r = setexeccon(exec_context);
4919                                 if (r < 0) {
4920                                         if (!context->selinux_context_ignore) {
4921                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4922                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4923                                         }
4924                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4925                                 }
4926                         }
4927                 }
4928 #endif
4929
4930 #if HAVE_APPARMOR
4931                 if (use_apparmor && context->apparmor_profile) {
4932                         r = aa_change_onexec(context->apparmor_profile);
4933                         if (r < 0 && !context->apparmor_profile_ignore) {
4934                                 *exit_status = EXIT_APPARMOR_PROFILE;
4935                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4936                         }
4937                 }
4938 #endif
4939
4940                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4941                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4942                  * requires CAP_SETPCAP. */
4943                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4944                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4945                          * effective set here.
4946                          *
4947                          * The effective set is overwritten during execve() with the following values:
4948                          *
4949                          * - ambient set (for non-root processes)
4950                          *
4951                          * - (inheritable | bounding) set (for root processes)
4952                          *
4953                          * Hence there is no security impact to raise it in the effective set before execve
4954                          */
4955                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4956                         if (r < 0) {
4957                                 *exit_status = EXIT_CAPABILITIES;
4958                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4959                         }
4960                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4961                                 *exit_status = EXIT_SECUREBITS;
4962                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4963                         }
4964                 }
4965
4966                 if (context_has_no_new_privileges(context))
4967                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4968                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4969                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4970                         }
4971
4972 #if HAVE_SECCOMP
4973                 r = apply_address_families(unit, context);
4974                 if (r < 0) {
4975                         *exit_status = EXIT_ADDRESS_FAMILIES;
4976                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4977                 }
4978
4979                 r = apply_memory_deny_write_execute(unit, context);
4980                 if (r < 0) {
4981                         *exit_status = EXIT_SECCOMP;
4982                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4983                 }
4984
4985                 r = apply_restrict_realtime(unit, context);
4986                 if (r < 0) {
4987                         *exit_status = EXIT_SECCOMP;
4988                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4989                 }
4990
4991                 r = apply_restrict_suid_sgid(unit, context);
4992                 if (r < 0) {
4993                         *exit_status = EXIT_SECCOMP;
4994                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4995                 }
4996
4997                 r = apply_restrict_namespaces(unit, context);
4998                 if (r < 0) {
4999                         *exit_status = EXIT_SECCOMP;
5000                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5001                 }
5002
5003                 r = apply_protect_sysctl(unit, context);
5004                 if (r < 0) {
5005                         *exit_status = EXIT_SECCOMP;
5006                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5007                 }
5008
5009                 r = apply_protect_kernel_modules(unit, context);
5010                 if (r < 0) {
5011                         *exit_status = EXIT_SECCOMP;
5012                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5013                 }
5014
5015                 r = apply_protect_kernel_logs(unit, context);
5016                 if (r < 0) {
5017                         *exit_status = EXIT_SECCOMP;
5018                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5019                 }
5020
5021                 r = apply_protect_clock(unit, context);
5022                 if (r < 0) {
5023                         *exit_status = EXIT_SECCOMP;
5024                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5025                 }
5026
5027                 r = apply_private_devices(unit, context);
5028                 if (r < 0) {
5029                         *exit_status = EXIT_SECCOMP;
5030                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5031                 }
5032
5033                 r = apply_syscall_archs(unit, context);
5034                 if (r < 0) {
5035                         *exit_status = EXIT_SECCOMP;
5036                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5037                 }
5038
5039                 r = apply_lock_personality(unit, context);
5040                 if (r < 0) {
5041                         *exit_status = EXIT_SECCOMP;
5042                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5043                 }
5044
5045                 r = apply_syscall_log(unit, context);
5046                 if (r < 0) {
5047                         *exit_status = EXIT_SECCOMP;
5048                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5049                 }
5050
5051                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5052                  * by the filter as little as possible. */
5053                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5054                 if (r < 0) {
5055                         *exit_status = EXIT_SECCOMP;
5056                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5057                 }
5058 #endif
5059
5060 #if HAVE_LIBBPF
5061                 r = apply_restrict_filesystems(unit, context);
5062                 if (r < 0) {
5063                         *exit_status = EXIT_BPF;
5064                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5065                 }
5066 #endif
5067
5068         }
5069
5070         if (!strv_isempty(context->unset_environment)) {
5071                 char **ee = NULL;
5072
5073                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5074                 if (!ee) {
5075                         *exit_status = EXIT_MEMORY;
5076                         return log_oom();
5077                 }
5078
5079                 strv_free_and_replace(accum_env, ee);
5080         }
5081
5082         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5083                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5084
5085                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5086                 if (r < 0) {
5087                         *exit_status = EXIT_MEMORY;
5088                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5089                 }
5090                 final_argv = replaced_argv;
5091
5092                 if (!strv_isempty(unset_variables)) {
5093                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5094                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5095                 }
5096
5097                 if (!strv_isempty(bad_variables)) {
5098                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5099                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5100                 }
5101         } else
5102                 final_argv = command->argv;
5103
5104         log_command_line(unit, "Executing", executable, final_argv);
5105
5106         if (exec_fd >= 0) {
5107                 uint8_t hot = 1;
5108
5109                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5110                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5111
5112                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5113                         *exit_status = EXIT_EXEC;
5114                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5115                 }
5116         }
5117
5118         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5119
5120         if (exec_fd >= 0) {
5121                 uint8_t hot = 0;
5122
5123                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5124                  * that POLLHUP on it no longer means execve() succeeded. */
5125
5126                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5127                         *exit_status = EXIT_EXEC;
5128                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5129                 }
5130         }
5131
5132         *exit_status = EXIT_EXEC;
5133         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5134 }
5135
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below
 * already calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5138
5139 int exec_spawn(Unit *unit,
5140                ExecCommand *command,
5141                const ExecContext *context,
5142                const ExecParameters *params,
5143                ExecRuntime *runtime,
5144                const CGroupContext *cgroup_context,
5145                pid_t *ret) {
5146
5147         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5148         _cleanup_free_ char *subcgroup_path = NULL;
5149         _cleanup_strv_free_ char **files_env = NULL;
5150         size_t n_storage_fds = 0, n_socket_fds = 0;
5151         pid_t pid;
5152
5153         assert(unit);
5154         assert(command);
5155         assert(context);
5156         assert(ret);
5157         assert(params);
5158         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5159
5160         LOG_CONTEXT_PUSH_UNIT(unit);
5161
5162         if (context->std_input == EXEC_INPUT_SOCKET ||
5163             context->std_output == EXEC_OUTPUT_SOCKET ||
5164             context->std_error == EXEC_OUTPUT_SOCKET) {
5165
5166                 if (params->n_socket_fds > 1)
5167                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5168
5169                 if (params->n_socket_fds == 0)
5170                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5171
5172                 socket_fd = params->fds[0];
5173         } else {
5174                 socket_fd = -EBADF;
5175                 fds = params->fds;
5176                 n_socket_fds = params->n_socket_fds;
5177                 n_storage_fds = params->n_storage_fds;
5178         }
5179
5180         r = exec_context_named_iofds(context, params, named_iofds);
5181         if (r < 0)
5182                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5183
5184         r = exec_context_load_environment(unit, context, &files_env);
5185         if (r < 0)
5186                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5187
5188         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5189            and, until the next SELinux policy changes, we save further reloads in future children. */
5190         mac_selinux_maybe_reload();
5191
5192         /* We won't know the real executable path until we create the mount namespace in the child, but we
5193            want to log from the parent, so we use the possibly inaccurate path here. */
5194         log_command_line(unit, "About to execute", command->path, command->argv);
5195
5196         if (params->cgroup_path) {
5197                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5198                 if (r < 0)
5199                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5200                 if (r > 0) {
5201                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5202                          * realized by the unit logic) */
5203
5204                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5205                         if (r < 0)
5206                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5207                 }
5208         }
5209
5210         pid = fork();
5211         if (pid < 0)
5212                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5213
5214         if (pid == 0) {
5215                 int exit_status;
5216
5217                 r = exec_child(unit,
5218                                command,
5219                                context,
5220                                params,
5221                                runtime,
5222                                cgroup_context,
5223                                socket_fd,
5224                                named_iofds,
5225                                fds,
5226                                n_socket_fds,
5227                                n_storage_fds,
5228                                files_env,
5229                                unit->manager->user_lookup_fds[1],
5230                                &exit_status);
5231
5232                 if (r < 0) {
5233                         const char *status = ASSERT_PTR(
5234                                         exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
5235
5236                         log_unit_struct_errno(unit, LOG_ERR, r,
5237                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5238                                               LOG_UNIT_INVOCATION_ID(unit),
5239                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5240                                                                status, command->path),
5241                                               "EXECUTABLE=%s", command->path);
5242                 } else
5243                         assert(exit_status == EXIT_SUCCESS);
5244
5245                 _exit(exit_status);
5246         }
5247
5248         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5249
5250         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5251          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5252          * process will be killed too). */
5253         if (subcgroup_path)
5254                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5255
5256         exec_status_start(&command->exec_status, pid);
5257
5258         *ret = pid;
5259         return 0;
5260 }
5261
5262 void exec_context_init(ExecContext *c) {
5263         assert(c);
5264
5265         *c = (ExecContext) {
5266                 .umask = 0022,
5267                 .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
5268                 .cpu_sched_policy = SCHED_OTHER,
5269                 .syslog_priority = LOG_DAEMON|LOG_INFO,
5270                 .syslog_level_prefix = true,
5271                 .ignore_sigpipe = true,
5272                 .timer_slack_nsec = NSEC_INFINITY,
5273                 .personality = PERSONALITY_INVALID,
5274                 .timeout_clean_usec = USEC_INFINITY,
5275                 .capability_bounding_set = CAP_MASK_UNSET,
5276                 .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
5277                 .log_level_max = -1,
5278 #if HAVE_SECCOMP
5279                 .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
5280 #endif
5281                 .tty_rows = UINT_MAX,
5282                 .tty_cols = UINT_MAX,
5283                 .private_mounts = -1,
5284                 .memory_ksm = -1,
5285                 .set_login_environment = -1,
5286         };
5287
5288         FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
5289                 d->mode = 0755;
5290
5291         numa_policy_reset(&c->numa_policy);
5292
5293         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5294 }
5295
5296 void exec_context_done(ExecContext *c) {
5297         assert(c);
5298
5299         c->environment = strv_free(c->environment);
5300         c->environment_files = strv_free(c->environment_files);
5301         c->pass_environment = strv_free(c->pass_environment);
5302         c->unset_environment = strv_free(c->unset_environment);
5303
5304         rlimit_free_all(c->rlimit);
5305
5306         for (size_t l = 0; l < 3; l++) {
5307                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5308                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5309         }
5310
5311         c->working_directory = mfree(c->working_directory);
5312         c->root_directory = mfree(c->root_directory);
5313         c->root_image = mfree(c->root_image);
5314         c->root_image_options = mount_options_free_all(c->root_image_options);
5315         c->root_hash = mfree(c->root_hash);
5316         c->root_hash_size = 0;
5317         c->root_hash_path = mfree(c->root_hash_path);
5318         c->root_hash_sig = mfree(c->root_hash_sig);
5319         c->root_hash_sig_size = 0;
5320         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5321         c->root_verity = mfree(c->root_verity);
5322         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5323         c->extension_directories = strv_free(c->extension_directories);
5324         c->tty_path = mfree(c->tty_path);
5325         c->syslog_identifier = mfree(c->syslog_identifier);
5326         c->user = mfree(c->user);
5327         c->group = mfree(c->group);
5328
5329         c->supplementary_groups = strv_free(c->supplementary_groups);
5330
5331         c->pam_name = mfree(c->pam_name);
5332
5333         c->read_only_paths = strv_free(c->read_only_paths);
5334         c->read_write_paths = strv_free(c->read_write_paths);
5335         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5336         c->exec_paths = strv_free(c->exec_paths);
5337         c->no_exec_paths = strv_free(c->no_exec_paths);
5338         c->exec_search_path = strv_free(c->exec_search_path);
5339
5340         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5341         c->bind_mounts = NULL;
5342         c->n_bind_mounts = 0;
5343         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5344         c->temporary_filesystems = NULL;
5345         c->n_temporary_filesystems = 0;
5346         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5347
5348         cpu_set_reset(&c->cpu_set);
5349         numa_policy_reset(&c->numa_policy);
5350
5351         c->utmp_id = mfree(c->utmp_id);
5352         c->selinux_context = mfree(c->selinux_context);
5353         c->apparmor_profile = mfree(c->apparmor_profile);
5354         c->smack_process_label = mfree(c->smack_process_label);
5355
5356         c->restrict_filesystems = set_free_free(c->restrict_filesystems);
5357
5358         c->syscall_filter = hashmap_free(c->syscall_filter);
5359         c->syscall_archs = set_free(c->syscall_archs);
5360         c->address_families = set_free(c->address_families);
5361
5362         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5363                 exec_directory_done(&c->directories[t]);
5364
5365         c->log_level_max = -1;
5366
5367         exec_context_free_log_extra_fields(c);
5368         c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5369         c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
5370
5371         c->log_ratelimit_interval_usec = 0;
5372         c->log_ratelimit_burst = 0;
5373
5374         c->stdin_data = mfree(c->stdin_data);
5375         c->stdin_data_size = 0;
5376
5377         c->network_namespace_path = mfree(c->network_namespace_path);
5378         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5379
5380         c->log_namespace = mfree(c->log_namespace);
5381
5382         c->load_credentials = hashmap_free(c->load_credentials);
5383         c->set_credentials = hashmap_free(c->set_credentials);
5384         c->import_credentials = set_free_free(c->import_credentials);
5385
5386         c->root_image_policy = image_policy_free(c->root_image_policy);
5387         c->mount_image_policy = image_policy_free(c->mount_image_policy);
5388         c->extension_image_policy = image_policy_free(c->extension_image_policy);
5389 }
5390
5391 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5392         assert(c);
5393
5394         if (!runtime_prefix)
5395                 return 0;
5396
5397         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5398                 _cleanup_free_ char *p = NULL;
5399
5400                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5401                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5402                 else
5403                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5404                 if (!p)
5405                         return -ENOMEM;
5406
5407                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5408                  * service next. */
5409                 (void) rm_rf(p, REMOVE_ROOT);
5410
5411                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5412                         _cleanup_free_ char *symlink_abs = NULL;
5413
5414                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5415                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5416                         else
5417                                 symlink_abs = path_join(runtime_prefix, *symlink);
5418                         if (!symlink_abs)
5419                                 return -ENOMEM;
5420
5421                         (void) unlink(symlink_abs);
5422                 }
5423         }
5424
5425         return 0;
5426 }
5427
5428 int exec_context_destroy_mount_ns_dir(Unit *u) {
5429         _cleanup_free_ char *p = NULL;
5430
5431         if (!u || !MANAGER_IS_SYSTEM(u->manager))
5432                 return 0;
5433
5434         p = path_join("/run/systemd/propagate/", u->id);
5435         if (!p)
5436                 return -ENOMEM;
5437
5438         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5439         if (rmdir(p) < 0 && errno != ENOENT)
5440                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5441
5442         return 0;
5443 }
5444
5445 static void exec_command_done(ExecCommand *c) {
5446         assert(c);
5447
5448         c->path = mfree(c->path);
5449         c->argv = strv_free(c->argv);
5450 }
5451
5452 void exec_command_done_array(ExecCommand *c, size_t n) {
5453         for (size_t i = 0; i < n; i++)
5454                 exec_command_done(c+i);
5455 }
5456
5457 ExecCommand* exec_command_free_list(ExecCommand *c) {
5458         ExecCommand *i;
5459
5460         while ((i = LIST_POP(command, c))) {
5461                 exec_command_done(i);
5462                 free(i);
5463         }
5464
5465         return NULL;
5466 }
5467
5468 void exec_command_free_array(ExecCommand **c, size_t n) {
5469         for (size_t i = 0; i < n; i++)
5470                 c[i] = exec_command_free_list(c[i]);
5471 }
5472
5473 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5474         for (size_t i = 0; i < n; i++)
5475                 exec_status_reset(&c[i].exec_status);
5476 }
5477
5478 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5479         for (size_t i = 0; i < n; i++)
5480                 LIST_FOREACH(command, z, c[i])
5481                         exec_status_reset(&z->exec_status);
5482 }
5483
/* Context handed to invalid_env() as its 'userdata' argument when environment files are cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit;  /* unit the log message is attributed to */
        const char *path;  /* path of the environment file the assignment was read from */
} InvalidEnvInfo;
5488
5489 static void invalid_env(const char *p, void *userdata) {
5490         InvalidEnvInfo *info = userdata;
5491
5492         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5493 }
5494
5495 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5496         assert(c);
5497
5498         switch (fd_index) {
5499
5500         case STDIN_FILENO:
5501                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5502                         return NULL;
5503
5504                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5505
5506         case STDOUT_FILENO:
5507                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5508                         return NULL;
5509
5510                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5511
5512         case STDERR_FILENO:
5513                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5514                         return NULL;
5515
5516                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5517
5518         default:
5519                 return NULL;
5520         }
5521 }
5522
5523 static int exec_context_named_iofds(
5524                 const ExecContext *c,
5525                 const ExecParameters *p,
5526                 int named_iofds[static 3]) {
5527
5528         size_t targets;
5529         const char* stdio_fdname[3];
5530         size_t n_fds;
5531
5532         assert(c);
5533         assert(p);
5534         assert(named_iofds);
5535
5536         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5537                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5538                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5539
5540         for (size_t i = 0; i < 3; i++)
5541                 stdio_fdname[i] = exec_context_fdname(c, i);
5542
5543         n_fds = p->n_storage_fds + p->n_socket_fds;
5544
5545         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5546                 if (named_iofds[STDIN_FILENO] < 0 &&
5547                     c->std_input == EXEC_INPUT_NAMED_FD &&
5548                     stdio_fdname[STDIN_FILENO] &&
5549                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5550
5551                         named_iofds[STDIN_FILENO] = p->fds[i];
5552                         targets--;
5553
5554                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5555                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5556                            stdio_fdname[STDOUT_FILENO] &&
5557                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5558
5559                         named_iofds[STDOUT_FILENO] = p->fds[i];
5560                         targets--;
5561
5562                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5563                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5564                            stdio_fdname[STDERR_FILENO] &&
5565                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5566
5567                         named_iofds[STDERR_FILENO] = p->fds[i];
5568                         targets--;
5569                 }
5570
5571         return targets == 0 ? 0 : -ENOENT;
5572 }
5573
5574 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5575         _cleanup_strv_free_ char **v = NULL;
5576         int r;
5577
5578         assert(c);
5579         assert(ret);
5580
5581         STRV_FOREACH(i, c->environment_files) {
5582                 _cleanup_globfree_ glob_t pglob = {};
5583                 bool ignore = false;
5584                 char *fn = *i;
5585
5586                 if (fn[0] == '-') {
5587                         ignore = true;
5588                         fn++;
5589                 }
5590
5591                 if (!path_is_absolute(fn)) {
5592                         if (ignore)
5593                                 continue;
5594                         return -EINVAL;
5595                 }
5596
5597                 /* Filename supports globbing, take all matching files */
5598                 r = safe_glob(fn, 0, &pglob);
5599                 if (r < 0) {
5600                         if (ignore)
5601                                 continue;
5602                         return r;
5603                 }
5604
5605                 /* When we don't match anything, -ENOENT should be returned */
5606                 assert(pglob.gl_pathc > 0);
5607
5608                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5609                         _cleanup_strv_free_ char **p = NULL;
5610
5611                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5612                         if (r < 0) {
5613                                 if (ignore)
5614                                         continue;
5615                                 return r;
5616                         }
5617
5618                         /* Log invalid environment variables with filename */
5619                         if (p) {
5620                                 InvalidEnvInfo info = {
5621                                         .unit = unit,
5622                                         .path = pglob.gl_pathv[n]
5623                                 };
5624
5625                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5626                         }
5627
5628                         if (!v)
5629                                 v = TAKE_PTR(p);
5630                         else {
5631                                 char **m = strv_env_merge(v, p);
5632                                 if (!m)
5633                                         return -ENOMEM;
5634
5635                                 strv_free_and_replace(v, m);
5636                         }
5637                 }
5638         }
5639
5640         *ret = TAKE_PTR(v);
5641
5642         return 0;
5643 }
5644
5645 static bool tty_may_match_dev_console(const char *tty) {
5646         _cleanup_free_ char *resolved = NULL;
5647
5648         if (!tty)
5649                 return true;
5650
5651         tty = skip_dev_prefix(tty);
5652
5653         /* trivial identity? */
5654         if (streq(tty, "console"))
5655                 return true;
5656
5657         if (resolve_dev_console(&resolved) < 0)
5658                 return true; /* if we could not resolve, assume it may */
5659
5660         /* "tty0" means the active VC, so it may be the same sometimes */
5661         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5662 }
5663
5664 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5665         assert(ec);
5666
5667         return ec->tty_reset ||
5668                 ec->tty_vhangup ||
5669                 ec->tty_vt_disallocate ||
5670                 is_terminal_input(ec->std_input) ||
5671                 is_terminal_output(ec->std_output) ||
5672                 is_terminal_output(ec->std_error);
5673 }
5674
5675 bool exec_context_may_touch_console(const ExecContext *ec) {
5676
5677         return exec_context_may_touch_tty(ec) &&
5678                tty_may_match_dev_console(exec_context_tty_path(ec));
5679 }
5680
5681 static void strv_fprintf(FILE *f, char **l) {
5682         assert(f);
5683
5684         STRV_FOREACH(g, l)
5685                 fprintf(f, " %s", *g);
5686 }
5687
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing is written if 'strv'
 * is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5699
5700 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5701         int r;
5702
5703         assert(c);
5704         assert(f);
5705
5706         prefix = strempty(prefix);
5707
5708         fprintf(f,
5709                 "%sUMask: %04o\n"
5710                 "%sWorkingDirectory: %s\n"
5711                 "%sRootDirectory: %s\n"
5712                 "%sRootEphemeral: %s\n"
5713                 "%sNonBlocking: %s\n"
5714                 "%sPrivateTmp: %s\n"
5715                 "%sPrivateDevices: %s\n"
5716                 "%sProtectKernelTunables: %s\n"
5717                 "%sProtectKernelModules: %s\n"
5718                 "%sProtectKernelLogs: %s\n"
5719                 "%sProtectClock: %s\n"
5720                 "%sProtectControlGroups: %s\n"
5721                 "%sPrivateNetwork: %s\n"
5722                 "%sPrivateUsers: %s\n"
5723                 "%sProtectHome: %s\n"
5724                 "%sProtectSystem: %s\n"
5725                 "%sMountAPIVFS: %s\n"
5726                 "%sIgnoreSIGPIPE: %s\n"
5727                 "%sMemoryDenyWriteExecute: %s\n"
5728                 "%sRestrictRealtime: %s\n"
5729                 "%sRestrictSUIDSGID: %s\n"
5730                 "%sKeyringMode: %s\n"
5731                 "%sProtectHostname: %s\n"
5732                 "%sProtectProc: %s\n"
5733                 "%sProcSubset: %s\n",
5734                 prefix, c->umask,
5735                 prefix, empty_to_root(c->working_directory),
5736                 prefix, empty_to_root(c->root_directory),
5737                 prefix, yes_no(c->root_ephemeral),
5738                 prefix, yes_no(c->non_blocking),
5739                 prefix, yes_no(c->private_tmp),
5740                 prefix, yes_no(c->private_devices),
5741                 prefix, yes_no(c->protect_kernel_tunables),
5742                 prefix, yes_no(c->protect_kernel_modules),
5743                 prefix, yes_no(c->protect_kernel_logs),
5744                 prefix, yes_no(c->protect_clock),
5745                 prefix, yes_no(c->protect_control_groups),
5746                 prefix, yes_no(c->private_network),
5747                 prefix, yes_no(c->private_users),
5748                 prefix, protect_home_to_string(c->protect_home),
5749                 prefix, protect_system_to_string(c->protect_system),
5750                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5751                 prefix, yes_no(c->ignore_sigpipe),
5752                 prefix, yes_no(c->memory_deny_write_execute),
5753                 prefix, yes_no(c->restrict_realtime),
5754                 prefix, yes_no(c->restrict_suid_sgid),
5755                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5756                 prefix, yes_no(c->protect_hostname),
5757                 prefix, protect_proc_to_string(c->protect_proc),
5758                 prefix, proc_subset_to_string(c->proc_subset));
5759
5760         if (c->root_image)
5761                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5762
5763         if (c->root_image_options) {
5764                 fprintf(f, "%sRootImageOptions:", prefix);
5765                 LIST_FOREACH(mount_options, o, c->root_image_options)
5766                         if (!isempty(o->options))
5767                                 fprintf(f, " %s:%s",
5768                                         partition_designator_to_string(o->partition_designator),
5769                                         o->options);
5770                 fprintf(f, "\n");
5771         }
5772
5773         if (c->root_hash) {
5774                 _cleanup_free_ char *encoded = NULL;
5775                 encoded = hexmem(c->root_hash, c->root_hash_size);
5776                 if (encoded)
5777                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5778         }
5779
5780         if (c->root_hash_path)
5781                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5782
5783         if (c->root_hash_sig) {
5784                 _cleanup_free_ char *encoded = NULL;
5785                 ssize_t len;
5786                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5787                 if (len)
5788                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5789         }
5790
5791         if (c->root_hash_sig_path)
5792                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5793
5794         if (c->root_verity)
5795                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5796
5797         STRV_FOREACH(e, c->environment)
5798                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5799
5800         STRV_FOREACH(e, c->environment_files)
5801                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5802
5803         STRV_FOREACH(e, c->pass_environment)
5804                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5805
5806         STRV_FOREACH(e, c->unset_environment)
5807                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5808
5809         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5810
5811         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5812                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5813
5814                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5815                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5816
5817                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5818                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5819                 }
5820         }
5821
5822         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5823
5824         if (c->nice_set)
5825                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5826
5827         if (c->oom_score_adjust_set)
5828                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5829
5830         if (c->coredump_filter_set)
5831                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5832
5833         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5834                 if (c->rlimit[i]) {
5835                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5836                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5837                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5838                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5839                 }
5840
5841         if (c->ioprio_set) {
5842                 _cleanup_free_ char *class_str = NULL;
5843
5844                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5845                 if (r >= 0)
5846                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5847
5848                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5849         }
5850
5851         if (c->cpu_sched_set) {
5852                 _cleanup_free_ char *policy_str = NULL;
5853
5854                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5855                 if (r >= 0)
5856                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5857
5858                 fprintf(f,
5859                         "%sCPUSchedulingPriority: %i\n"
5860                         "%sCPUSchedulingResetOnFork: %s\n",
5861                         prefix, c->cpu_sched_priority,
5862                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5863         }
5864
5865         if (c->cpu_set.set) {
5866                 _cleanup_free_ char *affinity = NULL;
5867
5868                 affinity = cpu_set_to_range_string(&c->cpu_set);
5869                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5870         }
5871
5872         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5873                 _cleanup_free_ char *nodes = NULL;
5874
5875                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5876                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5877                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5878         }
5879
5880         if (c->timer_slack_nsec != NSEC_INFINITY)
5881                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5882
5883         fprintf(f,
5884                 "%sStandardInput: %s\n"
5885                 "%sStandardOutput: %s\n"
5886                 "%sStandardError: %s\n",
5887                 prefix, exec_input_to_string(c->std_input),
5888                 prefix, exec_output_to_string(c->std_output),
5889                 prefix, exec_output_to_string(c->std_error));
5890
5891         if (c->std_input == EXEC_INPUT_NAMED_FD)
5892                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5893         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5894                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5895         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5896                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5897
5898         if (c->std_input == EXEC_INPUT_FILE)
5899                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5900         if (c->std_output == EXEC_OUTPUT_FILE)
5901                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5902         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5903                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5904         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5905                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5906         if (c->std_error == EXEC_OUTPUT_FILE)
5907                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5908         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5909                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5910         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5911                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5912
5913         if (c->tty_path)
5914                 fprintf(f,
5915                         "%sTTYPath: %s\n"
5916                         "%sTTYReset: %s\n"
5917                         "%sTTYVHangup: %s\n"
5918                         "%sTTYVTDisallocate: %s\n"
5919                         "%sTTYRows: %u\n"
5920                         "%sTTYColumns: %u\n",
5921                         prefix, c->tty_path,
5922                         prefix, yes_no(c->tty_reset),
5923                         prefix, yes_no(c->tty_vhangup),
5924                         prefix, yes_no(c->tty_vt_disallocate),
5925                         prefix, c->tty_rows,
5926                         prefix, c->tty_cols);
5927
5928         if (IN_SET(c->std_output,
5929                    EXEC_OUTPUT_KMSG,
5930                    EXEC_OUTPUT_JOURNAL,
5931                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5932                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5933             IN_SET(c->std_error,
5934                    EXEC_OUTPUT_KMSG,
5935                    EXEC_OUTPUT_JOURNAL,
5936                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5937                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5938
5939                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5940
5941                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5942                 if (r >= 0)
5943                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5944
5945                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5946                 if (r >= 0)
5947                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5948         }
5949
5950         if (c->log_level_max >= 0) {
5951                 _cleanup_free_ char *t = NULL;
5952
5953                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5954
5955                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5956         }
5957
5958         if (c->log_ratelimit_interval_usec > 0)
5959                 fprintf(f,
5960                         "%sLogRateLimitIntervalSec: %s\n",
5961                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5962
5963         if (c->log_ratelimit_burst > 0)
5964                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5965
5966         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5967                 fprintf(f, "%sLogFilterPatterns:", prefix);
5968
5969                 char *pattern;
5970                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5971                         fprintf(f, " %s", pattern);
5972                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5973                         fprintf(f, " ~%s", pattern);
5974                 fputc('\n', f);
5975         }
5976
5977         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5978                 fprintf(f, "%sLogExtraFields: ", prefix);
5979                 fwrite(c->log_extra_fields[j].iov_base,
5980                        1, c->log_extra_fields[j].iov_len,
5981                        f);
5982                 fputc('\n', f);
5983         }
5984
5985         if (c->log_namespace)
5986                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5987
5988         if (c->secure_bits) {
5989                 _cleanup_free_ char *str = NULL;
5990
5991                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5992                 if (r >= 0)
5993                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5994         }
5995
5996         if (c->capability_bounding_set != CAP_MASK_UNSET) {
5997                 _cleanup_free_ char *str = NULL;
5998
5999                 r = capability_set_to_string(c->capability_bounding_set, &str);
6000                 if (r >= 0)
6001                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6002         }
6003
6004         if (c->capability_ambient_set != 0) {
6005                 _cleanup_free_ char *str = NULL;
6006
6007                 r = capability_set_to_string(c->capability_ambient_set, &str);
6008                 if (r >= 0)
6009                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6010         }
6011
6012         if (c->user)
6013                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6014         if (c->group)
6015                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6016
6017         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6018
6019         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6020
6021         if (c->pam_name)
6022                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6023
6024         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6025         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6026         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6027         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6028         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6029         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6030
6031         for (size_t i = 0; i < c->n_bind_mounts; i++)
6032                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6033                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6034                         c->bind_mounts[i].ignore_enoent ? "-": "",
6035                         c->bind_mounts[i].source,
6036                         c->bind_mounts[i].destination,
6037                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6038
6039         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6040                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6041
6042                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6043                         t->path,
6044                         isempty(t->options) ? "" : ":",
6045                         strempty(t->options));
6046         }
6047
6048         if (c->utmp_id)
6049                 fprintf(f,
6050                         "%sUtmpIdentifier: %s\n",
6051                         prefix, c->utmp_id);
6052
6053         if (c->selinux_context)
6054                 fprintf(f,
6055                         "%sSELinuxContext: %s%s\n",
6056                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6057
6058         if (c->apparmor_profile)
6059                 fprintf(f,
6060                         "%sAppArmorProfile: %s%s\n",
6061                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6062
6063         if (c->smack_process_label)
6064                 fprintf(f,
6065                         "%sSmackProcessLabel: %s%s\n",
6066                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6067
6068         if (c->personality != PERSONALITY_INVALID)
6069                 fprintf(f,
6070                         "%sPersonality: %s\n",
6071                         prefix, strna(personality_to_string(c->personality)));
6072
6073         fprintf(f,
6074                 "%sLockPersonality: %s\n",
6075                 prefix, yes_no(c->lock_personality));
6076
6077         if (c->syscall_filter) {
6078                 fprintf(f,
6079                         "%sSystemCallFilter: ",
6080                         prefix);
6081
6082                 if (!c->syscall_allow_list)
6083                         fputc('~', f);
6084
6085 #if HAVE_SECCOMP
6086                 void *id, *val;
6087                 bool first = true;
6088                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6089                         _cleanup_free_ char *name = NULL;
6090                         const char *errno_name = NULL;
6091                         int num = PTR_TO_INT(val);
6092
6093                         if (first)
6094                                 first = false;
6095                         else
6096                                 fputc(' ', f);
6097
6098                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6099                         fputs(strna(name), f);
6100
6101                         if (num >= 0) {
6102                                 errno_name = seccomp_errno_or_action_to_string(num);
6103                                 if (errno_name)
6104                                         fprintf(f, ":%s", errno_name);
6105                                 else
6106                                         fprintf(f, ":%d", num);
6107                         }
6108                 }
6109 #endif
6110
6111                 fputc('\n', f);
6112         }
6113
6114         if (c->syscall_archs) {
6115                 fprintf(f,
6116                         "%sSystemCallArchitectures:",
6117                         prefix);
6118
6119 #if HAVE_SECCOMP
6120                 void *id;
6121                 SET_FOREACH(id, c->syscall_archs)
6122                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6123 #endif
6124                 fputc('\n', f);
6125         }
6126
6127         if (exec_context_restrict_namespaces_set(c)) {
6128                 _cleanup_free_ char *s = NULL;
6129
6130                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6131                 if (r >= 0)
6132                         fprintf(f, "%sRestrictNamespaces: %s\n",
6133                                 prefix, strna(s));
6134         }
6135
6136 #if HAVE_LIBBPF
6137         if (exec_context_restrict_filesystems_set(c)) {
6138                 char *fs;
6139                 SET_FOREACH(fs, c->restrict_filesystems)
6140                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6141         }
6142 #endif
6143
6144         if (c->network_namespace_path)
6145                 fprintf(f,
6146                         "%sNetworkNamespacePath: %s\n",
6147                         prefix, c->network_namespace_path);
6148
6149         if (c->syscall_errno > 0) {
6150                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6151
6152 #if HAVE_SECCOMP
6153                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6154                 if (errno_name)
6155                         fputs(errno_name, f);
6156                 else
6157                         fprintf(f, "%d", c->syscall_errno);
6158 #endif
6159                 fputc('\n', f);
6160         }
6161
6162         for (size_t i = 0; i < c->n_mount_images; i++) {
6163                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6164                         c->mount_images[i].ignore_enoent ? "-": "",
6165                         c->mount_images[i].source,
6166                         c->mount_images[i].destination);
6167                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6168                         fprintf(f, ":%s:%s",
6169                                 partition_designator_to_string(o->partition_designator),
6170                                 strempty(o->options));
6171                 fprintf(f, "\n");
6172         }
6173
6174         for (size_t i = 0; i < c->n_extension_images; i++) {
6175                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6176                         c->extension_images[i].ignore_enoent ? "-": "",
6177                         c->extension_images[i].source);
6178                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6179                         fprintf(f, ":%s:%s",
6180                                 partition_designator_to_string(o->partition_designator),
6181                                 strempty(o->options));
6182                 fprintf(f, "\n");
6183         }
6184
6185         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6186 }
6187
6188 bool exec_context_maintains_privileges(const ExecContext *c) {
6189         assert(c);
6190
6191         /* Returns true if the process forked off would run under
6192          * an unchanged UID or as root. */
6193
6194         if (!c->user)
6195                 return true;
6196
6197         if (streq(c->user, "root") || streq(c->user, "0"))
6198                 return true;
6199
6200         return false;
6201 }
6202
6203 int exec_context_get_effective_ioprio(const ExecContext *c) {
6204         int p;
6205
6206         assert(c);
6207
6208         if (c->ioprio_set)
6209                 return c->ioprio;
6210
6211         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6212         if (p < 0)
6213                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6214
6215         return ioprio_normalize(p);
6216 }
6217
6218 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6219         assert(c);
6220
6221         /* Explicit setting wins */
6222         if (c->mount_apivfs_set)
6223                 return c->mount_apivfs;
6224
6225         /* Default to "yes" if root directory or image are specified */
6226         if (exec_context_with_rootfs(c))
6227                 return true;
6228
6229         return false;
6230 }
6231
6232 void exec_context_free_log_extra_fields(ExecContext *c) {
6233         assert(c);
6234
6235         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6236                 free(c->log_extra_fields[l].iov_base);
6237         c->log_extra_fields = mfree(c->log_extra_fields);
6238         c->n_log_extra_fields = 0;
6239 }
6240
6241 void exec_context_revert_tty(ExecContext *c) {
6242         _cleanup_close_ int fd = -EBADF;
6243         const char *path;
6244         struct stat st;
6245         int r;
6246
6247         assert(c);
6248
6249         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6250         exec_context_tty_reset(c, NULL);
6251
6252         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6253          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6254          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6255         if (!exec_context_may_touch_tty(c))
6256                 return;
6257
6258         path = exec_context_tty_path(c);
6259         if (!path)
6260                 return;
6261
6262         fd = open(path, O_PATH|O_CLOEXEC);
6263         if (fd < 0)
6264                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6265                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6266                                              path);
6267
6268         if (fstat(fd, &st) < 0)
6269                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6270
6271         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6272          * if things are a character device, since a proper check either means we'd have to open the TTY and
6273          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6274          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6275          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6276         if (!S_ISCHR(st.st_mode))
6277                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6278
6279         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6280         if (r < 0)
6281                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6282 }
6283
6284 int exec_context_get_clean_directories(
6285                 ExecContext *c,
6286                 char **prefix,
6287                 ExecCleanMask mask,
6288                 char ***ret) {
6289
6290         _cleanup_strv_free_ char **l = NULL;
6291         int r;
6292
6293         assert(c);
6294         assert(prefix);
6295         assert(ret);
6296
6297         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6298                 if (!FLAGS_SET(mask, 1U << t))
6299                         continue;
6300
6301                 if (!prefix[t])
6302                         continue;
6303
6304                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6305                         char *j;
6306
6307                         j = path_join(prefix[t], c->directories[t].items[i].path);
6308                         if (!j)
6309                                 return -ENOMEM;
6310
6311                         r = strv_consume(&l, j);
6312                         if (r < 0)
6313                                 return r;
6314
6315                         /* Also remove private directories unconditionally. */
6316                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6317                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6318                                 if (!j)
6319                                         return -ENOMEM;
6320
6321                                 r = strv_consume(&l, j);
6322                                 if (r < 0)
6323                                         return r;
6324                         }
6325
6326                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6327                                 j = path_join(prefix[t], *symlink);
6328                                 if (!j)
6329                                         return -ENOMEM;
6330
6331                                 r = strv_consume(&l, j);
6332                                 if (r < 0)
6333                                         return r;
6334                         }
6335                 }
6336         }
6337
6338         *ret = TAKE_PTR(l);
6339         return 0;
6340 }
6341
6342 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6343         ExecCleanMask mask = 0;
6344
6345         assert(c);
6346         assert(ret);
6347
6348         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6349                 if (c->directories[t].n_items > 0)
6350                         mask |= 1U << t;
6351
6352         *ret = mask;
6353         return 0;
6354 }
6355
6356 void exec_status_start(ExecStatus *s, pid_t pid) {
6357         assert(s);
6358
6359         *s = (ExecStatus) {
6360                 .pid = pid,
6361         };
6362
6363         dual_timestamp_get(&s->start_timestamp);
6364 }
6365
6366 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6367         assert(s);
6368
6369         if (s->pid != pid)
6370                 *s = (ExecStatus) {
6371                         .pid = pid,
6372                 };
6373
6374         dual_timestamp_get(&s->exit_timestamp);
6375
6376         s->code = code;
6377         s->status = status;
6378
6379         if (context && context->utmp_id)
6380                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6381 }
6382
6383 void exec_status_reset(ExecStatus *s) {
6384         assert(s);
6385
6386         *s = (ExecStatus) {};
6387 }
6388
6389 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6390         assert(s);
6391         assert(f);
6392
6393         if (s->pid <= 0)
6394                 return;
6395
6396         prefix = strempty(prefix);
6397
6398         fprintf(f,
6399                 "%sPID: "PID_FMT"\n",
6400                 prefix, s->pid);
6401
6402         if (dual_timestamp_is_set(&s->start_timestamp))
6403                 fprintf(f,
6404                         "%sStart Timestamp: %s\n",
6405                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6406
6407         if (dual_timestamp_is_set(&s->exit_timestamp))
6408                 fprintf(f,
6409                         "%sExit Timestamp: %s\n"
6410                         "%sExit Code: %s\n"
6411                         "%sExit Status: %i\n",
6412                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6413                         prefix, sigchld_code_to_string(s->code),
6414                         prefix, s->status);
6415 }
6416
6417 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6418         _cleanup_free_ char *cmd = NULL;
6419         const char *prefix2;
6420
6421         assert(c);
6422         assert(f);
6423
6424         prefix = strempty(prefix);
6425         prefix2 = strjoina(prefix, "\t");
6426
6427         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6428
6429         fprintf(f,
6430                 "%sCommand Line: %s\n",
6431                 prefix, strnull(cmd));
6432
6433         exec_status_dump(&c->exec_status, f, prefix2);
6434 }
6435
6436 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6437         assert(f);
6438
6439         prefix = strempty(prefix);
6440
6441         LIST_FOREACH(command, i, c)
6442                 exec_command_dump(i, f, prefix);
6443 }
6444
6445 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6446         ExecCommand *end;
6447
6448         assert(l);
6449         assert(e);
6450
6451         if (*l) {
6452                 /* It's kind of important, that we keep the order here */
6453                 end = LIST_FIND_TAIL(command, *l);
6454                 LIST_INSERT_AFTER(command, *l, end, e);
6455         } else
6456                 *l = e;
6457 }
6458
6459 int exec_command_set(ExecCommand *c, const char *path, ...) {
6460         va_list ap;
6461         char **l, *p;
6462
6463         assert(c);
6464         assert(path);
6465
6466         va_start(ap, path);
6467         l = strv_new_ap(path, ap);
6468         va_end(ap);
6469
6470         if (!l)
6471                 return -ENOMEM;
6472
6473         p = strdup(path);
6474         if (!p) {
6475                 strv_free(l);
6476                 return -ENOMEM;
6477         }
6478
6479         free_and_replace(c->path, p);
6480
6481         return strv_free_and_replace(c->argv, l);
6482 }
6483
6484 int exec_command_append(ExecCommand *c, const char *path, ...) {
6485         _cleanup_strv_free_ char **l = NULL;
6486         va_list ap;
6487         int r;
6488
6489         assert(c);
6490         assert(path);
6491
6492         va_start(ap, path);
6493         l = strv_new_ap(path, ap);
6494         va_end(ap);
6495
6496         if (!l)
6497                 return -ENOMEM;
6498
6499         r = strv_extend_strv(&c->argv, l, false);
6500         if (r < 0)
6501                 return r;
6502
6503         return 0;
6504 }
6505
6506 static char *destroy_tree(char *path) {
6507         if (!path)
6508                 return NULL;
6509
6510         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6511                 log_debug("Spawning process to nuke '%s'", path);
6512
6513                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6514         }
6515
6516         return mfree(path);
6517 }
6518
6519 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6520         if (!rt)
6521                 return NULL;
6522
6523         if (rt->manager)
6524                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6525
6526         rt->id = mfree(rt->id);
6527         rt->tmp_dir = mfree(rt->tmp_dir);
6528         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6529         safe_close_pair(rt->netns_storage_socket);
6530         safe_close_pair(rt->ipcns_storage_socket);
6531         return mfree(rt);
6532 }
6533
6534 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6535 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6536
6537 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6538         if (!rt)
6539                 return NULL;
6540
6541         assert(rt->n_ref > 0);
6542         rt->n_ref--;
6543
6544         if (rt->n_ref > 0)
6545                 return NULL;
6546
6547         rt->tmp_dir = destroy_tree(rt->tmp_dir);
6548         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6549
6550         return exec_shared_runtime_free(rt);
6551 }
6552
6553 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6554         _cleanup_free_ char *id_copy = NULL;
6555         ExecSharedRuntime *n;
6556
6557         assert(ret);
6558
6559         id_copy = strdup(id);
6560         if (!id_copy)
6561                 return -ENOMEM;
6562
6563         n = new(ExecSharedRuntime, 1);
6564         if (!n)
6565                 return -ENOMEM;
6566
6567         *n = (ExecSharedRuntime) {
6568                 .id = TAKE_PTR(id_copy),
6569                 .netns_storage_socket = PIPE_EBADF,
6570                 .ipcns_storage_socket = PIPE_EBADF,
6571         };
6572
6573         *ret = n;
6574         return 0;
6575 }
6576
6577 static int exec_shared_runtime_add(
6578                 Manager *m,
6579                 const char *id,
6580                 char **tmp_dir,
6581                 char **var_tmp_dir,
6582                 int netns_storage_socket[2],
6583                 int ipcns_storage_socket[2],
6584                 ExecSharedRuntime **ret) {
6585
6586         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6587         int r;
6588
6589         assert(m);
6590         assert(id);
6591
6592         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6593
6594         r = exec_shared_runtime_allocate(&rt, id);
6595         if (r < 0)
6596                 return r;
6597
6598         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6599         if (r < 0)
6600                 return r;
6601
6602         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6603         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6604         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6605
6606         if (netns_storage_socket) {
6607                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6608                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6609         }
6610
6611         if (ipcns_storage_socket) {
6612                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6613                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6614         }
6615
6616         rt->manager = m;
6617
6618         if (ret)
6619                 *ret = rt;
6620         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6621         TAKE_PTR(rt);
6622         return 0;
6623 }
6624
6625 static int exec_shared_runtime_make(
6626                 Manager *m,
6627                 const ExecContext *c,
6628                 const char *id,
6629                 ExecSharedRuntime **ret) {
6630
6631         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6632         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6633         int r;
6634
6635         assert(m);
6636         assert(c);
6637         assert(id);
6638
6639         /* It is not necessary to create ExecSharedRuntime object. */
6640         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6641                 *ret = NULL;
6642                 return 0;
6643         }
6644
6645         if (c->private_tmp &&
6646             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6647               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6648                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6649                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6650                 if (r < 0)
6651                         return r;
6652         }
6653
6654         if (exec_needs_network_namespace(c)) {
6655                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6656                         return -errno;
6657         }
6658
6659         if (exec_needs_ipc_namespace(c)) {
6660                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6661                         return -errno;
6662         }
6663
6664         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6665         if (r < 0)
6666                 return r;
6667
6668         return 1;
6669 }
6670
6671 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6672         ExecSharedRuntime *rt;
6673         int r;
6674
6675         assert(m);
6676         assert(id);
6677         assert(ret);
6678
6679         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6680         if (rt)
6681                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6682                 goto ref;
6683
6684         if (!create) {
6685                 *ret = NULL;
6686                 return 0;
6687         }
6688
6689         /* If not found, then create a new object. */
6690         r = exec_shared_runtime_make(m, c, id, &rt);
6691         if (r < 0)
6692                 return r;
6693         if (r == 0) {
6694                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6695                 *ret = NULL;
6696                 return 0;
6697         }
6698
6699 ref:
6700         /* increment reference counter. */
6701         rt->n_ref++;
6702         *ret = rt;
6703         return 1;
6704 }
6705
6706 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6707         ExecSharedRuntime *rt;
6708
6709         assert(m);
6710         assert(f);
6711         assert(fds);
6712
6713         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6714                 fprintf(f, "exec-runtime=%s", rt->id);
6715
6716                 if (rt->tmp_dir)
6717                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6718
6719                 if (rt->var_tmp_dir)
6720                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6721
6722                 if (rt->netns_storage_socket[0] >= 0) {
6723                         int copy;
6724
6725                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6726                         if (copy < 0)
6727                                 return copy;
6728
6729                         fprintf(f, " netns-socket-0=%i", copy);
6730                 }
6731
6732                 if (rt->netns_storage_socket[1] >= 0) {
6733                         int copy;
6734
6735                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6736                         if (copy < 0)
6737                                 return copy;
6738
6739                         fprintf(f, " netns-socket-1=%i", copy);
6740                 }
6741
6742                 if (rt->ipcns_storage_socket[0] >= 0) {
6743                         int copy;
6744
6745                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6746                         if (copy < 0)
6747                                 return copy;
6748
6749                         fprintf(f, " ipcns-socket-0=%i", copy);
6750                 }
6751
6752                 if (rt->ipcns_storage_socket[1] >= 0) {
6753                         int copy;
6754
6755                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6756                         if (copy < 0)
6757                                 return copy;
6758
6759                         fprintf(f, " ipcns-socket-1=%i", copy);
6760                 }
6761
6762                 fputc('\n', f);
6763         }
6764
6765         return 0;
6766 }
6767
6768 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6769         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6770         ExecSharedRuntime *rt;
6771         int r;
6772
6773         /* This is for the migration from old (v237 or earlier) deserialization text.
6774          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6775          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6776          * so or not from the serialized text, then we always creates a new object owned by this. */
6777
6778         assert(u);
6779         assert(key);
6780         assert(value);
6781
6782         /* Manager manages ExecSharedRuntime objects by the unit id.
6783          * So, we omit the serialized text when the unit does not have id (yet?)... */
6784         if (isempty(u->id)) {
6785                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6786                 return 0;
6787         }
6788
6789         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6790                 return log_oom();
6791
6792         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6793         if (!rt) {
6794                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6795                         return log_oom();
6796
6797                 rt = rt_create;
6798         }
6799
6800         if (streq(key, "tmp-dir")) {
6801                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6802                         return -ENOMEM;
6803
6804         } else if (streq(key, "var-tmp-dir")) {
6805                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6806                         return -ENOMEM;
6807
6808         } else if (streq(key, "netns-socket-0")) {
6809                 int fd;
6810
6811                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6812                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6813                         return 0;
6814                 }
6815
6816                 safe_close(rt->netns_storage_socket[0]);
6817                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6818
6819         } else if (streq(key, "netns-socket-1")) {
6820                 int fd;
6821
6822                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6823                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6824                         return 0;
6825                 }
6826
6827                 safe_close(rt->netns_storage_socket[1]);
6828                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6829
6830         } else
6831                 return 0;
6832
6833         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6834         if (rt_create) {
6835                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6836                 if (r < 0) {
6837                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6838                         return 0;
6839                 }
6840
6841                 rt_create->manager = u->manager;
6842
6843                 /* Avoid cleanup */
6844                 TAKE_PTR(rt_create);
6845         }
6846
6847         return 1;
6848 }
6849
6850 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6851         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6852         char *id = NULL;
6853         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6854         const char *p, *v = ASSERT_PTR(value);
6855         size_t n;
6856
6857         assert(m);
6858         assert(fds);
6859
6860         n = strcspn(v, " ");
6861         id = strndupa_safe(v, n);
6862         if (v[n] != ' ')
6863                 goto finalize;
6864         p = v + n + 1;
6865
6866         v = startswith(p, "tmp-dir=");
6867         if (v) {
6868                 n = strcspn(v, " ");
6869                 tmp_dir = strndup(v, n);
6870                 if (!tmp_dir)
6871                         return log_oom();
6872                 if (v[n] != ' ')
6873                         goto finalize;
6874                 p = v + n + 1;
6875         }
6876
6877         v = startswith(p, "var-tmp-dir=");
6878         if (v) {
6879                 n = strcspn(v, " ");
6880                 var_tmp_dir = strndup(v, n);
6881                 if (!var_tmp_dir)
6882                         return log_oom();
6883                 if (v[n] != ' ')
6884                         goto finalize;
6885                 p = v + n + 1;
6886         }
6887
6888         v = startswith(p, "netns-socket-0=");
6889         if (v) {
6890                 char *buf;
6891
6892                 n = strcspn(v, " ");
6893                 buf = strndupa_safe(v, n);
6894
6895                 netns_fdpair[0] = parse_fd(buf);
6896                 if (netns_fdpair[0] < 0)
6897                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6898                 if (!fdset_contains(fds, netns_fdpair[0]))
6899                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6900                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6901                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6902                 if (v[n] != ' ')
6903                         goto finalize;
6904                 p = v + n + 1;
6905         }
6906
6907         v = startswith(p, "netns-socket-1=");
6908         if (v) {
6909                 char *buf;
6910
6911                 n = strcspn(v, " ");
6912                 buf = strndupa_safe(v, n);
6913
6914                 netns_fdpair[1] = parse_fd(buf);
6915                 if (netns_fdpair[1] < 0)
6916                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6917                 if (!fdset_contains(fds, netns_fdpair[1]))
6918                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6919                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6920                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6921                 if (v[n] != ' ')
6922                         goto finalize;
6923                 p = v + n + 1;
6924         }
6925
6926         v = startswith(p, "ipcns-socket-0=");
6927         if (v) {
6928                 char *buf;
6929
6930                 n = strcspn(v, " ");
6931                 buf = strndupa_safe(v, n);
6932
6933                 ipcns_fdpair[0] = parse_fd(buf);
6934                 if (ipcns_fdpair[0] < 0)
6935                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6936                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6937                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6938                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6939                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6940                 if (v[n] != ' ')
6941                         goto finalize;
6942                 p = v + n + 1;
6943         }
6944
6945         v = startswith(p, "ipcns-socket-1=");
6946         if (v) {
6947                 char *buf;
6948
6949                 n = strcspn(v, " ");
6950                 buf = strndupa_safe(v, n);
6951
6952                 ipcns_fdpair[1] = parse_fd(buf);
6953                 if (ipcns_fdpair[1] < 0)
6954                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6955                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6956                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6957                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6958                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6959         }
6960
6961 finalize:
6962         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6963         if (r < 0)
6964                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6965         return 0;
6966 }
6967
6968 void exec_shared_runtime_vacuum(Manager *m) {
6969         ExecSharedRuntime *rt;
6970
6971         assert(m);
6972
6973         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6974
6975         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6976                 if (rt->n_ref > 0)
6977                         continue;
6978
6979                 (void) exec_shared_runtime_free(rt);
6980         }
6981 }
6982
6983 int exec_runtime_make(
6984                 const Unit *unit,
6985                 const ExecContext *context,
6986                 ExecSharedRuntime *shared,
6987                 DynamicCreds *creds,
6988                 ExecRuntime **ret) {
6989         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6990         _cleanup_free_ char *ephemeral = NULL;
6991         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6992         int r;
6993
6994         assert(unit);
6995         assert(context);
6996         assert(ret);
6997
6998         if (!shared && !creds && !exec_needs_ephemeral(context)) {
6999                 *ret = NULL;
7000                 return 0;
7001         }
7002
7003         if (exec_needs_ephemeral(context)) {
7004                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7005                 if (r < 0)
7006                         return r;
7007
7008                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7009                 if (r < 0)
7010                         return r;
7011
7012                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7013                         return -errno;
7014         }
7015
7016         rt = new(ExecRuntime, 1);
7017         if (!rt)
7018                 return -ENOMEM;
7019
7020         *rt = (ExecRuntime) {
7021                 .shared = shared,
7022                 .dynamic_creds = creds,
7023                 .ephemeral_copy = TAKE_PTR(ephemeral),
7024                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7025                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7026         };
7027
7028         *ret = TAKE_PTR(rt);
7029         return 1;
7030 }
7031
7032 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7033         if (!rt)
7034                 return NULL;
7035
7036         exec_shared_runtime_unref(rt->shared);
7037         dynamic_creds_unref(rt->dynamic_creds);
7038
7039         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7040
7041         safe_close_pair(rt->ephemeral_storage_socket);
7042         return mfree(rt);
7043 }
7044
7045 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7046         if (!rt)
7047                 return NULL;
7048
7049         rt->shared = exec_shared_runtime_destroy(rt->shared);
7050         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7051         return exec_runtime_free(rt);
7052 }
7053
7054 void exec_params_clear(ExecParameters *p) {
7055         if (!p)
7056                 return;
7057
7058         p->environment = strv_free(p->environment);
7059         p->fd_names = strv_free(p->fd_names);
7060         p->fds = mfree(p->fds);
7061         p->exec_fd = safe_close(p->exec_fd);
7062 }
7063
7064 void exec_directory_done(ExecDirectory *d) {
7065         if (!d)
7066                 return;
7067
7068         for (size_t i = 0; i < d->n_items; i++) {
7069                 free(d->items[i].path);
7070                 strv_free(d->items[i].symlinks);
7071         }
7072
7073         d->items = mfree(d->items);
7074         d->n_items = 0;
7075         d->mode = 0755;
7076 }
7077
7078 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7079         assert(d);
7080         assert(path);
7081
7082         for (size_t i = 0; i < d->n_items; i++)
7083                 if (path_equal(d->items[i].path, path))
7084                         return &d->items[i];
7085
7086         return NULL;
7087 }
7088
7089 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7090         _cleanup_strv_free_ char **s = NULL;
7091         _cleanup_free_ char *p = NULL;
7092         ExecDirectoryItem *existing;
7093         int r;
7094
7095         assert(d);
7096         assert(path);
7097
7098         existing = exec_directory_find(d, path);
7099         if (existing) {
7100                 r = strv_extend(&existing->symlinks, symlink);
7101                 if (r < 0)
7102                         return r;
7103
7104                 return 0; /* existing item is updated */
7105         }
7106
7107         p = strdup(path);
7108         if (!p)
7109                 return -ENOMEM;
7110
7111         if (symlink) {
7112                 s = strv_new(symlink);
7113                 if (!s)
7114                         return -ENOMEM;
7115         }
7116
7117         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7118                 return -ENOMEM;
7119
7120         d->items[d->n_items++] = (ExecDirectoryItem) {
7121                 .path = TAKE_PTR(p),
7122                 .symlinks = TAKE_PTR(s),
7123         };
7124
7125         return 1; /* new item is added */
7126 }
7127
7128 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7129         assert(a);
7130         assert(b);
7131
7132         return path_compare(a->path, b->path);
7133 }
7134
7135 void exec_directory_sort(ExecDirectory *d) {
7136         assert(d);
7137
7138         /* Sort the exec directories to make always parent directories processed at first in
7139          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7140          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7141          * list. See also comments in setup_exec_directory() and issue #24783. */
7142
7143         if (d->n_items <= 1)
7144                 return;
7145
7146         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7147
7148         for (size_t i = 1; i < d->n_items; i++)
7149                 for (size_t j = 0; j < i; j++)
7150                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7151                                 d->items[i].only_create = true;
7152                                 break;
7153                         }
7154 }
7155
7156 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7157         ExecDirectoryType t;
7158
7159         assert(s);
7160
7161         if (streq(s, "all"))
7162                 return EXEC_CLEAN_ALL;
7163         if (streq(s, "fdstore"))
7164                 return EXEC_CLEAN_FDSTORE;
7165
7166         t = exec_resource_type_from_string(s);
7167         if (t < 0)
7168                 return (ExecCleanMask) t;
7169
7170         return 1U << t;
7171 }
7172
/* String names of the ExecInput values, indexed by the enum value. The table is handed to
 * DEFINE_STRING_TABLE_LOOKUP() below, which is defined elsewhere; presumably it generates the
 * exec_input_to_string()/exec_input_from_string() pair — confirm against the macro definition. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7185
/* String names of the ExecOutput values, indexed by the enum value. Note that the three file-based
 * variants map to "file", "append" and "truncate". The table is handed to
 * DEFINE_STRING_TABLE_LOOKUP() below (defined elsewhere). */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7202
/* String names of the ExecUtmpMode values, indexed by the enum value, for use by the lookup
 * functions DEFINE_STRING_TABLE_LOOKUP() generates below (macro defined elsewhere). */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7210
/* String names of the ExecPreserveMode values, indexed by the enum value. Unlike the neighbouring
 * tables this one uses the _WITH_BOOLEAN lookup variant with EXEC_PRESERVE_YES as extra argument.
 * NOTE(review): presumably this makes the parser accept boolean strings too, with "true" mapping
 * to EXEC_PRESERVE_YES — confirm against the macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7218
/* This table maps ExecDirectoryType to the name of the unit setting it is configured with. It is
 * one of several tables in this file that are indexed by ExecDirectoryType. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7229
/* This table maps ExecDirectoryType to the name of the symlink setting it is configured with in the
 * unit, i.e. the setting name of exec_directory_type_table with a "Symlink" suffix. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7240
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. The generated
 * exec_resource_type_from_string() is what exec_clean_mask_from_string() uses to parse its argument. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7253
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
 * the service payload in. Only the private to-string direction is generated for this table (see the
 * macro used below); there is no string-to-enum lookup for environment variable names. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7265
/* String names of the ExecKeyringMode values, indexed by the enum value, for use by the lookup
 * functions DEFINE_STRING_TABLE_LOOKUP() generates below (macro defined elsewhere). */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);