src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/personality.h>
  10 #include <sys/prctl.h>
  11 #include <sys/shm.h>
  12 #include <sys/types.h>
  13 #include <sys/un.h>
  14 #include <unistd.h>
  15 #include <utmpx.h>
  16
  17 #if HAVE_PAM
  18 #include <security/pam_appl.h>
  19 #endif
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #if HAVE_SECCOMP
  26 #include <seccomp.h>
  27 #endif
  28
  29 #if HAVE_APPARMOR
  30 #include <sys/apparmor.h>
  31 #endif
  32
  33 #include "sd-messages.h"
  34
  35 #include "af-list.h"
  36 #include "alloc-util.h"
  37 #if HAVE_APPARMOR
  38 #include "apparmor-util.h"
  39 #endif
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "cap-list.h"
  43 #include "capability-util.h"
  44 #include "chown-recursive.h"
  45 #include "cgroup-setup.h"
  46 #include "cpu-set-util.h"
  47 #include "def.h"
  48 #include "env-file.h"
  49 #include "env-util.h"
  50 #include "errno-list.h"
  51 #include "execute.h"
  52 #include "exit-status.h"
  53 #include "fd-util.h"
  54 #include "format-util.h"
  55 #include "fs-util.h"
  56 #include "glob-util.h"
  57 #include "hexdecoct.h"
  58 #include "io-util.h"
  59 #include "ioprio.h"
  60 #include "label.h"
  61 #include "log.h"
  62 #include "macro.h"
  63 #include "manager.h"
  64 #include "memory-util.h"
  65 #include "missing_fs.h"
  66 #include "mkdir.h"
  67 #include "namespace.h"
  68 #include "parse-util.h"
  69 #include "path-util.h"
  70 #include "process-util.h"
  71 #include "rlimit-util.h"
  72 #include "rm-rf.h"
  73 #if HAVE_SECCOMP
  74 #include "seccomp-util.h"
  75 #endif
  76 #include "securebits-util.h"
  77 #include "selinux-util.h"
  78 #include "signal-util.h"
  79 #include "smack-util.h"
  80 #include "socket-util.h"
  81 #include "special.h"
  82 #include "stat-util.h"
  83 #include "string-table.h"
  84 #include "string-util.h"
  85 #include "strv.h"
  86 #include "syslog-util.h"
  87 #include "terminal-util.h"
  88 #include "umask-util.h"
  89 #include "unit.h"
  90 #include "user-util.h"
  91 #include "utmp-wtmp.h"
  92
/* Two idle timeouts, expressed as multiples of USEC_PER_SEC (5x and 1x). The code that consumes them is
 * not part of this section of the file. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested through fd_inc_sndbuf() for the journal stream socket set up in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
  97
  98 static int shift_fds(int fds[], size_t n_fds) {
  99         int start, restart_from;
 100
 101         if (n_fds <= 0)
 102                 return 0;
 103
 104         /* Modifies the fds array! (sorts it) */
 105
 106         assert(fds);
 107
 108         start = 0;
 109         for (;;) {
 110                 int i;
 111
 112                 restart_from = -1;
 113
 114                 for (i = start; i < (int) n_fds; i++) {
 115                         int nfd;
 116
 117                         /* Already at right index? */
 118                         if (fds[i] == i+3)
 119                                 continue;
 120
 121                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 122                         if (nfd < 0)
 123                                 return -errno;
 124
 125                         safe_close(fds[i]);
 126                         fds[i] = nfd;
 127
 128                         /* Hmm, the fd we wanted isn't free? Then
 129                          * let's remember that and try again from here */
 130                         if (nfd != i+3 && restart_from < 0)
 131                                 restart_from = i;
 132                 }
 133
 134                 if (restart_from < 0)
 135                         break;
 136
 137                 start = restart_from;
 138         }
 139
 140         return 0;
 141 }
 142
 143 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 144         size_t i, n_fds;
 145         int r;
 146
 147         n_fds = n_socket_fds + n_storage_fds;
 148         if (n_fds <= 0)
 149                 return 0;
 150
 151         assert(fds);
 152
 153         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 154          * O_NONBLOCK only applies to socket activation though. */
 155
 156         for (i = 0; i < n_fds; i++) {
 157
 158                 if (i < n_socket_fds) {
 159                         r = fd_nonblock(fds[i], nonblock);
 160                         if (r < 0)
 161                                 return r;
 162                 }
 163
 164                 /* We unconditionally drop FD_CLOEXEC from the fds,
 165                  * since after all we want to pass these fds to our
 166                  * children */
 167
 168                 r = fd_cloexec(fds[i], false);
 169                 if (r < 0)
 170                         return r;
 171         }
 172
 173         return 0;
 174 }
 175
 176 static const char *exec_context_tty_path(const ExecContext *context) {
 177         assert(context);
 178
 179         if (context->stdio_as_fds)
 180                 return NULL;
 181
 182         if (context->tty_path)
 183                 return context->tty_path;
 184
 185         return "/dev/console";
 186 }
 187
 188 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 189         const char *path;
 190
 191         assert(context);
 192
 193         path = exec_context_tty_path(context);
 194
 195         if (context->tty_vhangup) {
 196                 if (p && p->stdin_fd >= 0)
 197                         (void) terminal_vhangup_fd(p->stdin_fd);
 198                 else if (path)
 199                         (void) terminal_vhangup(path);
 200         }
 201
 202         if (context->tty_reset) {
 203                 if (p && p->stdin_fd >= 0)
 204                         (void) reset_terminal_fd(p->stdin_fd, true);
 205                 else if (path)
 206                         (void) reset_terminal(path);
 207         }
 208
 209         if (context->tty_vt_disallocate && path)
 210                 (void) vt_disallocate(path);
 211 }
 212
 213 static bool is_terminal_input(ExecInput i) {
 214         return IN_SET(i,
 215                       EXEC_INPUT_TTY,
 216                       EXEC_INPUT_TTY_FORCE,
 217                       EXEC_INPUT_TTY_FAIL);
 218 }
 219
 220 static bool is_terminal_output(ExecOutput o) {
 221         return IN_SET(o,
 222                       EXEC_OUTPUT_TTY,
 223                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 224                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 225 }
 226
 227 static bool is_kmsg_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_KMSG,
 230                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 231 }
 232
 233 static bool exec_context_needs_term(const ExecContext *c) {
 234         assert(c);
 235
 236         /* Return true if the execution context suggests we should set $TERM to something useful. */
 237
 238         if (is_terminal_input(c->std_input))
 239                 return true;
 240
 241         if (is_terminal_output(c->std_output))
 242                 return true;
 243
 244         if (is_terminal_output(c->std_error))
 245                 return true;
 246
 247         return !!c->tty_path;
 248 }
 249
 250 static int open_null_as(int flags, int nfd) {
 251         int fd;
 252
 253         assert(nfd >= 0);
 254
 255         fd = open("/dev/null", flags|O_NOCTTY);
 256         if (fd < 0)
 257                 return -errno;
 258
 259         return move_fd(fd, nfd, false);
 260 }
 261
 262 static int connect_journal_socket(
 263                 int fd,
 264                 const char *log_namespace,
 265                 uid_t uid,
 266                 gid_t gid) {
 267
 268         union sockaddr_union sa;
 269         socklen_t sa_len;
 270         uid_t olduid = UID_INVALID;
 271         gid_t oldgid = GID_INVALID;
 272         const char *j;
 273         int r;
 274
 275         j = log_namespace ?
 276                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 277                 "/run/systemd/journal/stdout";
 278         r = sockaddr_un_set_path(&sa.un, j);
 279         if (r < 0)
 280                 return r;
 281         sa_len = r;
 282
 283         if (gid_is_valid(gid)) {
 284                 oldgid = getgid();
 285
 286                 if (setegid(gid) < 0)
 287                         return -errno;
 288         }
 289
 290         if (uid_is_valid(uid)) {
 291                 olduid = getuid();
 292
 293                 if (seteuid(uid) < 0) {
 294                         r = -errno;
 295                         goto restore_gid;
 296                 }
 297         }
 298
 299         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 300
 301         /* If we fail to restore the uid or gid, things will likely
 302            fail later on. This should only happen if an LSM interferes. */
 303
 304         if (uid_is_valid(uid))
 305                 (void) seteuid(olduid);
 306
 307  restore_gid:
 308         if (gid_is_valid(gid))
 309                 (void) setegid(oldgid);
 310
 311         return r;
 312 }
 313
 314 static int connect_logger_as(
 315                 const Unit *unit,
 316                 const ExecContext *context,
 317                 const ExecParameters *params,
 318                 ExecOutput output,
 319                 const char *ident,
 320                 int nfd,
 321                 uid_t uid,
 322                 gid_t gid) {
 323
 324         _cleanup_close_ int fd = -1;
 325         int r;
 326
 327         assert(context);
 328         assert(params);
 329         assert(output < _EXEC_OUTPUT_MAX);
 330         assert(ident);
 331         assert(nfd >= 0);
 332
 333         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 334         if (fd < 0)
 335                 return -errno;
 336
 337         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 338         if (r < 0)
 339                 return r;
 340
 341         if (shutdown(fd, SHUT_RD) < 0)
 342                 return -errno;
 343
 344         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 345
 346         if (dprintf(fd,
 347                 "%s\n"
 348                 "%s\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n",
 354                 context->syslog_identifier ?: ident,
 355                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 356                 context->syslog_priority,
 357                 !!context->syslog_level_prefix,
 358                 false,
 359                 is_kmsg_output(output),
 360                 is_terminal_output(output)) < 0)
 361                 return -errno;
 362
 363         return move_fd(TAKE_FD(fd), nfd, false);
 364 }
 365
 366 static int open_terminal_as(const char *path, int flags, int nfd) {
 367         int fd;
 368
 369         assert(path);
 370         assert(nfd >= 0);
 371
 372         fd = open_terminal(path, flags | O_NOCTTY);
 373         if (fd < 0)
 374                 return fd;
 375
 376         return move_fd(fd, nfd, false);
 377 }
 378
 379 static int acquire_path(const char *path, int flags, mode_t mode) {
 380         union sockaddr_union sa;
 381         socklen_t sa_len;
 382         _cleanup_close_ int fd = -1;
 383         int r;
 384
 385         assert(path);
 386
 387         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 388                 flags |= O_CREAT;
 389
 390         fd = open(path, flags|O_NOCTTY, mode);
 391         if (fd >= 0)
 392                 return TAKE_FD(fd);
 393
 394         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 395                 return -errno;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         r = sockaddr_un_set_path(&sa.un, path);
 400         if (r < 0)
 401                 return r == -EINVAL ? -ENXIO : r;
 402         sa_len = r;
 403
 404         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 405         if (fd < 0)
 406                 return -errno;
 407
 408         if (connect(fd, &sa.sa, sa_len) < 0)
 409                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 410                                                            * indication that his wasn't an AF_UNIX socket after all */
 411
 412         if ((flags & O_ACCMODE) == O_RDONLY)
 413                 r = shutdown(fd, SHUT_WR);
 414         else if ((flags & O_ACCMODE) == O_WRONLY)
 415                 r = shutdown(fd, SHUT_RD);
 416         else
 417                 r = 0;
 418         if (r < 0)
 419                 return -errno;
 420
 421         return TAKE_FD(fd);
 422 }
 423
 424 static int fixup_input(
 425                 const ExecContext *context,
 426                 int socket_fd,
 427                 bool apply_tty_stdin) {
 428
 429         ExecInput std_input;
 430
 431         assert(context);
 432
 433         std_input = context->std_input;
 434
 435         if (is_terminal_input(std_input) && !apply_tty_stdin)
 436                 return EXEC_INPUT_NULL;
 437
 438         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         return std_input;
 445 }
 446
 447 static int fixup_output(ExecOutput std_output, int socket_fd) {
 448
 449         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 450                 return EXEC_OUTPUT_INHERIT;
 451
 452         return std_output;
 453 }
 454
 455 static int setup_input(
 456                 const ExecContext *context,
 457                 const ExecParameters *params,
 458                 int socket_fd,
 459                 const int named_iofds[static 3]) {
 460
 461         ExecInput i;
 462
 463         assert(context);
 464         assert(params);
 465         assert(named_iofds);
 466
 467         if (params->stdin_fd >= 0) {
 468                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 469                         return -errno;
 470
 471                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 472                 if (isatty(STDIN_FILENO)) {
 473                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 474                         (void) reset_terminal_fd(STDIN_FILENO, true);
 475                 }
 476
 477                 return STDIN_FILENO;
 478         }
 479
 480         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 481
 482         switch (i) {
 483
 484         case EXEC_INPUT_NULL:
 485                 return open_null_as(O_RDONLY, STDIN_FILENO);
 486
 487         case EXEC_INPUT_TTY:
 488         case EXEC_INPUT_TTY_FORCE:
 489         case EXEC_INPUT_TTY_FAIL: {
 490                 int fd;
 491
 492                 fd = acquire_terminal(exec_context_tty_path(context),
 493                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 494                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 495                                                                   ACQUIRE_TERMINAL_WAIT,
 496                                       USEC_INFINITY);
 497                 if (fd < 0)
 498                         return fd;
 499
 500                 return move_fd(fd, STDIN_FILENO, false);
 501         }
 502
 503         case EXEC_INPUT_SOCKET:
 504                 assert(socket_fd >= 0);
 505
 506                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 507
 508         case EXEC_INPUT_NAMED_FD:
 509                 assert(named_iofds[STDIN_FILENO] >= 0);
 510
 511                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 512                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_DATA: {
 515                 int fd;
 516
 517                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 518                 if (fd < 0)
 519                         return fd;
 520
 521                 return move_fd(fd, STDIN_FILENO, false);
 522         }
 523
 524         case EXEC_INPUT_FILE: {
 525                 bool rw;
 526                 int fd;
 527
 528                 assert(context->stdio_file[STDIN_FILENO]);
 529
 530                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 531                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 532
 533                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 534                 if (fd < 0)
 535                         return fd;
 536
 537                 return move_fd(fd, STDIN_FILENO, false);
 538         }
 539
 540         default:
 541                 assert_not_reached("Unknown input type");
 542         }
 543 }
 544
 545 static bool can_inherit_stderr_from_stdout(
 546                 const ExecContext *context,
 547                 ExecOutput o,
 548                 ExecOutput e) {
 549
 550         assert(context);
 551
 552         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 553          * stderr fd */
 554
 555         if (e == EXEC_OUTPUT_INHERIT)
 556                 return true;
 557         if (e != o)
 558                 return false;
 559
 560         if (e == EXEC_OUTPUT_NAMED_FD)
 561                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 562
 563         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 564                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 565
 566         return true;
 567 }
 568
 569 static int setup_output(
 570                 const Unit *unit,
 571                 const ExecContext *context,
 572                 const ExecParameters *params,
 573                 int fileno,
 574                 int socket_fd,
 575                 const int named_iofds[static 3],
 576                 const char *ident,
 577                 uid_t uid,
 578                 gid_t gid,
 579                 dev_t *journal_stream_dev,
 580                 ino_t *journal_stream_ino) {
 581
 582         ExecOutput o;
 583         ExecInput i;
 584         int r;
 585
 586         assert(unit);
 587         assert(context);
 588         assert(params);
 589         assert(ident);
 590         assert(journal_stream_dev);
 591         assert(journal_stream_ino);
 592
 593         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 594
 595                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 596                         return -errno;
 597
 598                 return STDOUT_FILENO;
 599         }
 600
 601         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 602                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 603                         return -errno;
 604
 605                 return STDERR_FILENO;
 606         }
 607
 608         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 609         o = fixup_output(context->std_output, socket_fd);
 610
 611         if (fileno == STDERR_FILENO) {
 612                 ExecOutput e;
 613                 e = fixup_output(context->std_error, socket_fd);
 614
 615                 /* This expects the input and output are already set up */
 616
 617                 /* Don't change the stderr file descriptor if we inherit all
 618                  * the way and are not on a tty */
 619                 if (e == EXEC_OUTPUT_INHERIT &&
 620                     o == EXEC_OUTPUT_INHERIT &&
 621                     i == EXEC_INPUT_NULL &&
 622                     !is_terminal_input(context->std_input) &&
 623                     getppid () != 1)
 624                         return fileno;
 625
 626                 /* Duplicate from stdout if possible */
 627                 if (can_inherit_stderr_from_stdout(context, o, e))
 628                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 629
 630                 o = e;
 631
 632         } else if (o == EXEC_OUTPUT_INHERIT) {
 633                 /* If input got downgraded, inherit the original value */
 634                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 635                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 636
 637                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 638                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 639                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 640
 641                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 642                 if (getppid() != 1)
 643                         return fileno;
 644
 645                 /* We need to open /dev/null here anew, to get the right access mode. */
 646                 return open_null_as(O_WRONLY, fileno);
 647         }
 648
 649         switch (o) {
 650
 651         case EXEC_OUTPUT_NULL:
 652                 return open_null_as(O_WRONLY, fileno);
 653
 654         case EXEC_OUTPUT_TTY:
 655                 if (is_terminal_input(i))
 656                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 657
 658                 /* We don't reset the terminal if this is just about output */
 659                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 660
 661         case EXEC_OUTPUT_KMSG:
 662         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 663         case EXEC_OUTPUT_JOURNAL:
 664         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 665                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 666                 if (r < 0) {
 667                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 668                         r = open_null_as(O_WRONLY, fileno);
 669                 } else {
 670                         struct stat st;
 671
 672                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 673                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 674                          * services to detect whether they are connected to the journal or not.
 675                          *
 676                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 677                          * about STDERR as that's usually the best way to do logging. */
 678
 679                         if (fstat(fileno, &st) >= 0 &&
 680                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 681                                 *journal_stream_dev = st.st_dev;
 682                                 *journal_stream_ino = st.st_ino;
 683                         }
 684                 }
 685                 return r;
 686
 687         case EXEC_OUTPUT_SOCKET:
 688                 assert(socket_fd >= 0);
 689
 690                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 691
 692         case EXEC_OUTPUT_NAMED_FD:
 693                 assert(named_iofds[fileno] >= 0);
 694
 695                 (void) fd_nonblock(named_iofds[fileno], false);
 696                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 697
 698         case EXEC_OUTPUT_FILE:
 699         case EXEC_OUTPUT_FILE_APPEND: {
 700                 bool rw;
 701                 int fd, flags;
 702
 703                 assert(context->stdio_file[fileno]);
 704
 705                 rw = context->std_input == EXEC_INPUT_FILE &&
 706                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 707
 708                 if (rw)
 709                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 710
 711                 flags = O_WRONLY;
 712                 if (o == EXEC_OUTPUT_FILE_APPEND)
 713                         flags |= O_APPEND;
 714
 715                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 716                 if (fd < 0)
 717                         return fd;
 718
 719                 return move_fd(fd, fileno, 0);
 720         }
 721
 722         default:
 723                 assert_not_reached("Unknown error type");
 724         }
 725 }
 726
 727 static int chown_terminal(int fd, uid_t uid) {
 728         int r;
 729
 730         assert(fd >= 0);
 731
 732         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 733         if (isatty(fd) < 1) {
 734                 if (IN_SET(errno, EINVAL, ENOTTY))
 735                         return 0; /* not a tty */
 736
 737                 return -errno;
 738         }
 739
 740         /* This might fail. What matters are the results. */
 741         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 742         if (r < 0)
 743                 return r;
 744
 745         return 1;
 746 }
 747
 748 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 749         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 750         int r;
 751
 752         assert(_saved_stdin);
 753         assert(_saved_stdout);
 754
 755         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 756         if (saved_stdin < 0)
 757                 return -errno;
 758
 759         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 760         if (saved_stdout < 0)
 761                 return -errno;
 762
 763         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 764         if (fd < 0)
 765                 return fd;
 766
 767         r = chown_terminal(fd, getuid());
 768         if (r < 0)
 769                 return r;
 770
 771         r = reset_terminal_fd(fd, true);
 772         if (r < 0)
 773                 return r;
 774
 775         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 776         fd = -1;
 777         if (r < 0)
 778                 return r;
 779
 780         *_saved_stdin = saved_stdin;
 781         *_saved_stdout = saved_stdout;
 782
 783         saved_stdin = saved_stdout = -1;
 784
 785         return 0;
 786 }
 787
 788 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 789         assert(err < 0);
 790
 791         if (err == -ETIMEDOUT)
 792                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 793         else {
 794                 errno = -err;
 795                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 796         }
 797 }
 798
 799 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 800         _cleanup_close_ int fd = -1;
 801
 802         assert(vc);
 803
 804         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 805         if (fd < 0)
 806                 return;
 807
 808         write_confirm_error_fd(err, fd, u);
 809 }
 810
 811 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 812         int r = 0;
 813
 814         assert(saved_stdin);
 815         assert(saved_stdout);
 816
 817         release_terminal();
 818
 819         if (*saved_stdin >= 0)
 820                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 821                         r = -errno;
 822
 823         if (*saved_stdout >= 0)
 824                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 825                         r = -errno;
 826
 827         *saved_stdin = safe_close(*saved_stdin);
 828         *saved_stdout = safe_close(*saved_stdout);
 829
 830         return r;
 831 }
 832
 833 enum {
 834         CONFIRM_PRETEND_FAILURE = -1,
 835         CONFIRM_PRETEND_SUCCESS =  0,
 836         CONFIRM_EXECUTE = 1,
 837 };
 838
 839 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 840         int saved_stdout = -1, saved_stdin = -1, r;
 841         _cleanup_free_ char *e = NULL;
 842         char c;
 843
 844         /* For any internal errors, assume a positive response. */
 845         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 846         if (r < 0) {
 847                 write_confirm_error(r, vc, u);
 848                 return CONFIRM_EXECUTE;
 849         }
 850
 851         /* confirm_spawn might have been disabled while we were sleeping. */
 852         if (manager_is_confirm_spawn_disabled(u->manager)) {
 853                 r = 1;
 854                 goto restore_stdio;
 855         }
 856
 857         e = ellipsize(cmdline, 60, 100);
 858         if (!e) {
 859                 log_oom();
 860                 r = CONFIRM_EXECUTE;
 861                 goto restore_stdio;
 862         }
 863
 864         for (;;) {
 865                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 866                 if (r < 0) {
 867                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 868                         r = CONFIRM_EXECUTE;
 869                         goto restore_stdio;
 870                 }
 871
 872                 switch (c) {
 873                 case 'c':
 874                         printf("Resuming normal execution.\n");
 875                         manager_disable_confirm_spawn();
 876                         r = 1;
 877                         break;
 878                 case 'D':
 879                         unit_dump(u, stdout, "  ");
 880                         continue; /* ask again */
 881                 case 'f':
 882                         printf("Failing execution.\n");
 883                         r = CONFIRM_PRETEND_FAILURE;
 884                         break;
 885                 case 'h':
 886                         printf("  c - continue, proceed without asking anymore\n"
 887                                "  D - dump, show the state of the unit\n"
 888                                "  f - fail, don't execute the command and pretend it failed\n"
 889                                "  h - help\n"
 890                                "  i - info, show a short summary of the unit\n"
 891                                "  j - jobs, show jobs that are in progress\n"
 892                                "  s - skip, don't execute the command and pretend it succeeded\n"
 893                                "  y - yes, execute the command\n");
 894                         continue; /* ask again */
 895                 case 'i':
 896                         printf("  Description: %s\n"
 897                                "  Unit:        %s\n"
 898                                "  Command:     %s\n",
 899                                u->id, u->description, cmdline);
 900                         continue; /* ask again */
 901                 case 'j':
 902                         manager_dump_jobs(u->manager, stdout, "  ");
 903                         continue; /* ask again */
 904                 case 'n':
 905                         /* 'n' was removed in favor of 'f'. */
 906                         printf("Didn't understand 'n', did you mean 'f'?\n");
 907                         continue; /* ask again */
 908                 case 's':
 909                         printf("Skipping execution.\n");
 910                         r = CONFIRM_PRETEND_SUCCESS;
 911                         break;
 912                 case 'y':
 913                         r = CONFIRM_EXECUTE;
 914                         break;
 915                 default:
 916                         assert_not_reached("Unhandled choice");
 917                 }
 918                 break;
 919         }
 920
 921 restore_stdio:
 922         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 923         return r;
 924 }
 925
 926 static int get_fixed_user(const ExecContext *c, const char **user,
 927                           uid_t *uid, gid_t *gid,
 928                           const char **home, const char **shell) {
 929         int r;
 930         const char *name;
 931
 932         assert(c);
 933
 934         if (!c->user)
 935                 return 0;
 936
 937         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 938          * (i.e. are "/" or "/bin/nologin"). */
 939
 940         name = c->user;
 941         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 942         if (r < 0)
 943                 return r;
 944
 945         *user = name;
 946         return 0;
 947 }
 948
 949 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->group)
 956                 return 0;
 957
 958         name = c->group;
 959         r = get_group_creds(&name, gid, 0);
 960         if (r < 0)
 961                 return r;
 962
 963         *group = name;
 964         return 0;
 965 }
 966
 967 static int get_supplementary_groups(const ExecContext *c, const char *user,
 968                                     const char *group, gid_t gid,
 969                                     gid_t **supplementary_gids, int *ngids) {
 970         char **i;
 971         int r, k = 0;
 972         int ngroups_max;
 973         bool keep_groups = false;
 974         gid_t *groups = NULL;
 975         _cleanup_free_ gid_t *l_gids = NULL;
 976
 977         assert(c);
 978
 979         /*
 980          * If user is given, then lookup GID and supplementary groups list.
 981          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 982          * here and as early as possible so we keep the list of supplementary
 983          * groups of the caller.
 984          */
 985         if (user && gid_is_valid(gid) && gid != 0) {
 986                 /* First step, initialize groups from /etc/groups */
 987                 if (initgroups(user, gid) < 0)
 988                         return -errno;
 989
 990                 keep_groups = true;
 991         }
 992
 993         if (strv_isempty(c->supplementary_groups))
 994                 return 0;
 995
 996         /*
 997          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 998          * be positive, otherwise fail.
 999          */
1000         errno = 0;
1001         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1002         if (ngroups_max <= 0)
1003                 return errno_or_else(EOPNOTSUPP);
1004
1005         l_gids = new(gid_t, ngroups_max);
1006         if (!l_gids)
1007                 return -ENOMEM;
1008
1009         if (keep_groups) {
1010                 /*
1011                  * Lookup the list of groups that the user belongs to, we
1012                  * avoid NSS lookups here too for gid=0.
1013                  */
1014                 k = ngroups_max;
1015                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1016                         return -EINVAL;
1017         } else
1018                 k = 0;
1019
1020         STRV_FOREACH(i, c->supplementary_groups) {
1021                 const char *g;
1022
1023                 if (k >= ngroups_max)
1024                         return -E2BIG;
1025
1026                 g = *i;
1027                 r = get_group_creds(&g, l_gids+k, 0);
1028                 if (r < 0)
1029                         return r;
1030
1031                 k++;
1032         }
1033
1034         /*
1035          * Sets ngids to zero to drop all supplementary groups, happens
1036          * when we are under root and SupplementaryGroups= is empty.
1037          */
1038         if (k == 0) {
1039                 *ngids = 0;
1040                 return 0;
1041         }
1042
1043         /* Otherwise get the final list of supplementary groups */
1044         groups = memdup(l_gids, sizeof(gid_t) * k);
1045         if (!groups)
1046                 return -ENOMEM;
1047
1048         *supplementary_gids = groups;
1049         *ngids = k;
1050
1051         groups = NULL;
1052
1053         return 0;
1054 }
1055
/* Applies the group credentials to the calling process.
 *
 * If ngids is positive the supplementary group list is installed via maybe_setgroups(). Afterwards, if
 * gid is valid, the real, effective and saved GID are all set to it. Returns 0 on success, the negative
 * error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1074
1075 static int enforce_user(const ExecContext *context, uid_t uid) {
1076         assert(context);
1077
1078         if (!uid_is_valid(uid))
1079                 return 0;
1080
1081         /* Sets (but doesn't look up) the uid and make sure we keep the
1082          * capabilities while doing so. */
1083
1084         if (context->capability_ambient_set != 0) {
1085
1086                 /* First step: If we need to keep capabilities but
1087                  * drop privileges we need to make sure we keep our
1088                  * caps, while we drop privileges. */
1089                 if (uid != 0) {
1090                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1091
1092                         if (prctl(PR_GET_SECUREBITS) != sb)
1093                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1094                                         return -errno;
1095                 }
1096         }
1097
1098         /* Second step: actually set the uids */
1099         if (setresuid(uid, uid, uid) < 0)
1100                 return -errno;
1101
1102         /* At this point we should have all necessary capabilities but
1103            are otherwise a normal user. However, the caps might got
1104            corrupted due to the setresuid() so we need clean them up
1105            later. This is done outside of this call. */
1106
1107         return 0;
1108 }
1109
1110 #if HAVE_PAM
1111
1112 static int null_conv(
1113                 int num_msg,
1114                 const struct pam_message **msg,
1115                 struct pam_response **resp,
1116                 void *appdata_ptr) {
1117
1118         /* We don't support conversations */
1119
1120         return PAM_CONV_ERR;
1121 }
1122
1123 #endif
1124
/* Opens a PAM session for the process that is about to be executed.
 *
 * name:      PAM service name passed to pam_start()
 * user:      user name passed to pam_start()
 * uid, gid:  credentials the forked-off "(sd-pam)" helper process switches to
 * tty:       TTY path registered as PAM_TTY; if NULL, the TTY behind stdin is used if there is one
 * env:       in: variables exported to PAM via pam_putenv(); out: replaced by the list returned by
 *            pam_getenvlist()
 * fds/n_fds: file descriptors the helper process closes right after the fork
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or the negative
 * error of barrier_create()/safe_fork() if one of those failed. When built without PAM support this is
 * a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through its return value (a positive
                                 * error number); it neither returns a negative value nor sets
                                 * errno, hence check the return value itself. */
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1348
/* Renames the calling process to "(<name>)", where <name> is the last path component of "path",
 * shortened to its final 8 characters if it is longer. If the last component is empty the process is
 * renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *base;
        size_t n;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1379
1380 static bool context_has_address_families(const ExecContext *c) {
1381         assert(c);
1382
1383         return c->address_families_allow_list ||
1384                 !set_isempty(c->address_families);
1385 }
1386
1387 static bool context_has_syscall_filters(const ExecContext *c) {
1388         assert(c);
1389
1390         return c->syscall_allow_list ||
1391                 !hashmap_isempty(c->syscall_filter);
1392 }
1393
1394 static bool context_has_no_new_privileges(const ExecContext *c) {
1395         assert(c);
1396
1397         if (c->no_new_privileges)
1398                 return true;
1399
1400         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1401                 return false;
1402
1403         /* We need NNP if we have any form of seccomp and are unprivileged */
1404         return context_has_address_families(c) ||
1405                 c->memory_deny_write_execute ||
1406                 c->restrict_realtime ||
1407                 c->restrict_suid_sgid ||
1408                 exec_context_restrict_namespaces_set(c) ||
1409                 c->protect_clock ||
1410                 c->protect_kernel_tunables ||
1411                 c->protect_kernel_modules ||
1412                 c->protect_kernel_logs ||
1413                 c->private_devices ||
1414                 context_has_syscall_filters(c) ||
1415                 !set_isempty(c->syscall_archs) ||
1416                 c->lock_personality ||
1417                 c->protect_hostname;
1418 }
1419
1420 #if HAVE_SECCOMP
1421
1422 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1423
1424         if (is_seccomp_available())
1425                 return false;
1426
1427         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428         return true;
1429 }
1430
1431 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1432         uint32_t negative_action, default_action, action;
1433         int r;
1434
1435         assert(u);
1436         assert(c);
1437
1438         if (!context_has_syscall_filters(c))
1439                 return 0;
1440
1441         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1442                 return 0;
1443
1444         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446         if (c->syscall_allow_list) {
1447                 default_action = negative_action;
1448                 action = SCMP_ACT_ALLOW;
1449         } else {
1450                 default_action = SCMP_ACT_ALLOW;
1451                 action = negative_action;
1452         }
1453
1454         if (needs_ambient_hack) {
1455                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456                 if (r < 0)
1457                         return r;
1458         }
1459
1460         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1464         assert(u);
1465         assert(c);
1466
1467         if (set_isempty(c->syscall_archs))
1468                 return 0;
1469
1470         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1471                 return 0;
1472
1473         return seccomp_restrict_archs(c->syscall_archs);
1474 }
1475
1476 static int apply_address_families(const Unit* u, const ExecContext *c) {
1477         assert(u);
1478         assert(c);
1479
1480         if (!context_has_address_families(c))
1481                 return 0;
1482
1483         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1484                 return 0;
1485
1486         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1487 }
1488
1489 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1490         assert(u);
1491         assert(c);
1492
1493         if (!c->memory_deny_write_execute)
1494                 return 0;
1495
1496         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1497                 return 0;
1498
1499         return seccomp_memory_deny_write_execute();
1500 }
1501
1502 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1503         assert(u);
1504         assert(c);
1505
1506         if (!c->restrict_realtime)
1507                 return 0;
1508
1509         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1510                 return 0;
1511
1512         return seccomp_restrict_realtime();
1513 }
1514
1515 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1516         assert(u);
1517         assert(c);
1518
1519         if (!c->restrict_suid_sgid)
1520                 return 0;
1521
1522         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1523                 return 0;
1524
1525         return seccomp_restrict_suid_sgid();
1526 }
1527
1528 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1529         assert(u);
1530         assert(c);
1531
1532         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1533          * let's protect even those systems where this is left on in the kernel. */
1534
1535         if (!c->protect_kernel_tunables)
1536                 return 0;
1537
1538         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1539                 return 0;
1540
1541         return seccomp_protect_sysctl();
1542 }
1543
1544 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1545         assert(u);
1546         assert(c);
1547
1548         /* Turn off module syscalls on ProtectKernelModules=yes */
1549
1550         if (!c->protect_kernel_modules)
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1554                 return 0;
1555
1556         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1557 }
1558
1559 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1560         assert(u);
1561         assert(c);
1562
1563         if (!c->protect_kernel_logs)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1567                 return 0;
1568
1569         return seccomp_protect_syslog();
1570 }
1571
1572 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1573         assert(u);
1574         assert(c);
1575
1576         if (!c->protect_clock)
1577                 return 0;
1578
1579         if (skip_seccomp_unavailable(u, "ProtectClock="))
1580                 return 0;
1581
1582         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1583 }
1584
1585 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1586         assert(u);
1587         assert(c);
1588
1589         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1590
1591         if (!c->private_devices)
1592                 return 0;
1593
1594         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1595                 return 0;
1596
1597         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1598 }
1599
1600 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1601         assert(u);
1602         assert(c);
1603
1604         if (!exec_context_restrict_namespaces_set(c))
1605                 return 0;
1606
1607         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1608                 return 0;
1609
1610         return seccomp_restrict_namespaces(c->restrict_namespaces);
1611 }
1612
1613 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1614         unsigned long personality;
1615         int r;
1616
1617         assert(u);
1618         assert(c);
1619
1620         if (!c->lock_personality)
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "LockPersonality="))
1624                 return 0;
1625
1626         personality = c->personality;
1627
1628         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1629         if (personality == PERSONALITY_INVALID) {
1630
1631                 r = opinionated_personality(&personality);
1632                 if (r < 0)
1633                         return r;
1634         }
1635
1636         return seccomp_lock_personality(personality);
1637 }
1638
1639 #endif
1640
1641 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1642         assert(u);
1643         assert(c);
1644
1645         if (!c->protect_hostname)
1646                 return 0;
1647
1648         if (ns_type_supported(NAMESPACE_UTS)) {
1649                 if (unshare(CLONE_NEWUTS) < 0) {
1650                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1651                                 *ret_exit_status = EXIT_NAMESPACE;
1652                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1653                         }
1654
1655                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1656                 }
1657         } else
1658                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1659
1660 #if HAVE_SECCOMP
1661         int r;
1662
1663         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1664                 return 0;
1665
1666         r = seccomp_protect_hostname();
1667         if (r < 0) {
1668                 *ret_exit_status = EXIT_SECCOMP;
1669                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1670         }
1671 #endif
1672
1673         return 0;
1674 }
1675
1676 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1677         assert(idle_pipe);
1678
1679         idle_pipe[1] = safe_close(idle_pipe[1]);
1680         idle_pipe[2] = safe_close(idle_pipe[2]);
1681
1682         if (idle_pipe[0] >= 0) {
1683                 int r;
1684
1685                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1686
1687                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1688                         ssize_t n;
1689
1690                         /* Signal systemd that we are bored and want to continue. */
1691                         n = write(idle_pipe[3], "x", 1);
1692                         if (n > 0)
1693                                 /* Wait for systemd to react to the signal above. */
1694                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1695                 }
1696
1697                 idle_pipe[0] = safe_close(idle_pipe[0]);
1698
1699         }
1700
1701         idle_pipe[3] = safe_close(idle_pipe[3]);
1702 }
1703
1704 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1705
1706 static int build_environment(
1707                 const Unit *u,
1708                 const ExecContext *c,
1709                 const ExecParameters *p,
1710                 size_t n_fds,
1711                 const char *home,
1712                 const char *username,
1713                 const char *shell,
1714                 dev_t journal_stream_dev,
1715                 ino_t journal_stream_ino,
1716                 char ***ret) {
1717
1718         _cleanup_strv_free_ char **our_env = NULL;
1719         ExecDirectoryType t;
1720         size_t n_env = 0;
1721         char *x;
1722
1723         assert(u);
1724         assert(c);
1725         assert(p);
1726         assert(ret);
1727
1728         our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1729         if (!our_env)
1730                 return -ENOMEM;
1731
1732         if (n_fds > 0) {
1733                 _cleanup_free_ char *joined = NULL;
1734
1735                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1736                         return -ENOMEM;
1737                 our_env[n_env++] = x;
1738
1739                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1740                         return -ENOMEM;
1741                 our_env[n_env++] = x;
1742
1743                 joined = strv_join(p->fd_names, ":");
1744                 if (!joined)
1745                         return -ENOMEM;
1746
1747                 x = strjoin("LISTEN_FDNAMES=", joined);
1748                 if (!x)
1749                         return -ENOMEM;
1750                 our_env[n_env++] = x;
1751         }
1752
1753         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1754                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1755                         return -ENOMEM;
1756                 our_env[n_env++] = x;
1757
1758                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1759                         return -ENOMEM;
1760                 our_env[n_env++] = x;
1761         }
1762
1763         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1764          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1765          * check the database directly. */
1766         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1767                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1768                 if (!x)
1769                         return -ENOMEM;
1770                 our_env[n_env++] = x;
1771         }
1772
1773         if (home) {
1774                 x = strjoin("HOME=", home);
1775                 if (!x)
1776                         return -ENOMEM;
1777
1778                 path_simplify(x + 5, true);
1779                 our_env[n_env++] = x;
1780         }
1781
1782         if (username) {
1783                 x = strjoin("LOGNAME=", username);
1784                 if (!x)
1785                         return -ENOMEM;
1786                 our_env[n_env++] = x;
1787
1788                 x = strjoin("USER=", username);
1789                 if (!x)
1790                         return -ENOMEM;
1791                 our_env[n_env++] = x;
1792         }
1793
1794         if (shell) {
1795                 x = strjoin("SHELL=", shell);
1796                 if (!x)
1797                         return -ENOMEM;
1798
1799                 path_simplify(x + 6, true);
1800                 our_env[n_env++] = x;
1801         }
1802
1803         if (!sd_id128_is_null(u->invocation_id)) {
1804                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1805                         return -ENOMEM;
1806
1807                 our_env[n_env++] = x;
1808         }
1809
1810         if (exec_context_needs_term(c)) {
1811                 const char *tty_path, *term = NULL;
1812
1813                 tty_path = exec_context_tty_path(c);
1814
1815                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1816                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1817                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1818
1819                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1820                         term = getenv("TERM");
1821
1822                 if (!term)
1823                         term = default_term_for_tty(tty_path);
1824
1825                 x = strjoin("TERM=", term);
1826                 if (!x)
1827                         return -ENOMEM;
1828                 our_env[n_env++] = x;
1829         }
1830
1831         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1832                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1833                         return -ENOMEM;
1834
1835                 our_env[n_env++] = x;
1836         }
1837
1838         if (c->log_namespace) {
1839                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1840                 if (!x)
1841                         return -ENOMEM;
1842
1843                 our_env[n_env++] = x;
1844         }
1845
1846         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1847                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1848                 const char *n;
1849
1850                 if (!p->prefix[t])
1851                         continue;
1852
1853                 if (strv_isempty(c->directories[t].paths))
1854                         continue;
1855
1856                 n = exec_directory_env_name_to_string(t);
1857                 if (!n)
1858                         continue;
1859
1860                 pre = strjoin(p->prefix[t], "/");
1861                 if (!pre)
1862                         return -ENOMEM;
1863
1864                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1865                 if (!joined)
1866                         return -ENOMEM;
1867
1868                 x = strjoin(n, "=", joined);
1869                 if (!x)
1870                         return -ENOMEM;
1871
1872                 our_env[n_env++] = x;
1873         }
1874
1875         our_env[n_env++] = NULL;
1876         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1877
1878         *ret = TAKE_PTR(our_env);
1879
1880         return 0;
1881 }
1882
1883 static int build_pass_environment(const ExecContext *c, char ***ret) {
1884         _cleanup_strv_free_ char **pass_env = NULL;
1885         size_t n_env = 0, n_bufsize = 0;
1886         char **i;
1887
1888         STRV_FOREACH(i, c->pass_environment) {
1889                 _cleanup_free_ char *x = NULL;
1890                 char *v;
1891
1892                 v = getenv(*i);
1893                 if (!v)
1894                         continue;
1895                 x = strjoin(*i, "=", v);
1896                 if (!x)
1897                         return -ENOMEM;
1898
1899                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1900                         return -ENOMEM;
1901
1902                 pass_env[n_env++] = TAKE_PTR(x);
1903                 pass_env[n_env] = NULL;
1904         }
1905
1906         *ret = TAKE_PTR(pass_env);
1907
1908         return 0;
1909 }
1910
1911 static bool exec_needs_mount_namespace(
1912                 const ExecContext *context,
1913                 const ExecParameters *params,
1914                 const ExecRuntime *runtime) {
1915
1916         assert(context);
1917         assert(params);
1918
1919         if (context->root_image)
1920                 return true;
1921
1922         if (!strv_isempty(context->read_write_paths) ||
1923             !strv_isempty(context->read_only_paths) ||
1924             !strv_isempty(context->inaccessible_paths))
1925                 return true;
1926
1927         if (context->n_bind_mounts > 0)
1928                 return true;
1929
1930         if (context->n_temporary_filesystems > 0)
1931                 return true;
1932
1933         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1934                 return true;
1935
1936         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1937                 return true;
1938
1939         if (context->private_devices ||
1940             context->private_mounts ||
1941             context->protect_system != PROTECT_SYSTEM_NO ||
1942             context->protect_home != PROTECT_HOME_NO ||
1943             context->protect_kernel_tunables ||
1944             context->protect_kernel_modules ||
1945             context->protect_kernel_logs ||
1946             context->protect_control_groups)
1947                 return true;
1948
1949         if (context->root_directory) {
1950                 ExecDirectoryType t;
1951
1952                 if (context->mount_apivfs)
1953                         return true;
1954
1955                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1956                         if (!params->prefix[t])
1957                                 continue;
1958
1959                         if (!strv_isempty(context->directories[t].paths))
1960                                 return true;
1961                 }
1962         }
1963
1964         if (context->dynamic_user &&
1965             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1966              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1967              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1968                 return true;
1969
1970         if (context->log_namespace)
1971                 return true;
1972
1973         return false;
1974 }
1975
1976 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1977         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1978         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1979         _cleanup_close_ int unshare_ready_fd = -1;
1980         _cleanup_(sigkill_waitp) pid_t pid = 0;
1981         uint64_t c = 1;
1982         ssize_t n;
1983         int r;
1984
1985         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1986          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1987          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1988          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1989          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1990          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1991          * continues execution normally.
1992          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
1993          * does not need CAP_SETUID to write the single line mapping to itself. */
1994
1995         /* Can only set up multiple mappings with CAP_SETUID. */
1996         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
1997                 r = asprintf(&uid_map,
1998                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
1999                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2000                              ouid, ouid, uid, uid);
2001         else
2002                 r = asprintf(&uid_map,
2003                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2004                              ouid, ouid);
2005
2006         if (r < 0)
2007                 return -ENOMEM;
2008
2009         /* Can only set up multiple mappings with CAP_SETGID. */
2010         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2011                 r = asprintf(&gid_map,
2012                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2013                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2014                              ogid, ogid, gid, gid);
2015         else
2016                 r = asprintf(&gid_map,
2017                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2018                              ogid, ogid);
2019
2020         if (r < 0)
2021                 return -ENOMEM;
2022
2023         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2024          * namespace. */
2025         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2026         if (unshare_ready_fd < 0)
2027                 return -errno;
2028
2029         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2030          * failed. */
2031         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2032                 return -errno;
2033
2034         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2035         if (r < 0)
2036                 return r;
2037         if (r == 0) {
2038                 _cleanup_close_ int fd = -1;
2039                 const char *a;
2040                 pid_t ppid;
2041
2042                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2043                  * here, after the parent opened its own user namespace. */
2044
2045                 ppid = getppid();
2046                 errno_pipe[0] = safe_close(errno_pipe[0]);
2047
2048                 /* Wait until the parent unshared the user namespace */
2049                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2050                         r = -errno;
2051                         goto child_fail;
2052                 }
2053
2054                 /* Disable the setgroups() system call in the child user namespace, for good. */
2055                 a = procfs_file_alloca(ppid, "setgroups");
2056                 fd = open(a, O_WRONLY|O_CLOEXEC);
2057                 if (fd < 0) {
2058                         if (errno != ENOENT) {
2059                                 r = -errno;
2060                                 goto child_fail;
2061                         }
2062
2063                         /* If the file is missing the kernel is too old, let's continue anyway. */
2064                 } else {
2065                         if (write(fd, "deny\n", 5) < 0) {
2066                                 r = -errno;
2067                                 goto child_fail;
2068                         }
2069
2070                         fd = safe_close(fd);
2071                 }
2072
2073                 /* First write the GID map */
2074                 a = procfs_file_alloca(ppid, "gid_map");
2075                 fd = open(a, O_WRONLY|O_CLOEXEC);
2076                 if (fd < 0) {
2077                         r = -errno;
2078                         goto child_fail;
2079                 }
2080                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2081                         r = -errno;
2082                         goto child_fail;
2083                 }
2084                 fd = safe_close(fd);
2085
2086                 /* The write the UID map */
2087                 a = procfs_file_alloca(ppid, "uid_map");
2088                 fd = open(a, O_WRONLY|O_CLOEXEC);
2089                 if (fd < 0) {
2090                         r = -errno;
2091                         goto child_fail;
2092                 }
2093                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2094                         r = -errno;
2095                         goto child_fail;
2096                 }
2097
2098                 _exit(EXIT_SUCCESS);
2099
2100         child_fail:
2101                 (void) write(errno_pipe[1], &r, sizeof(r));
2102                 _exit(EXIT_FAILURE);
2103         }
2104
2105         errno_pipe[1] = safe_close(errno_pipe[1]);
2106
2107         if (unshare(CLONE_NEWUSER) < 0)
2108                 return -errno;
2109
2110         /* Let the child know that the namespace is ready now */
2111         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2112                 return -errno;
2113
2114         /* Try to read an error code from the child */
2115         n = read(errno_pipe[0], &r, sizeof(r));
2116         if (n < 0)
2117                 return -errno;
2118         if (n == sizeof(r)) { /* an error code was sent to us */
2119                 if (r < 0)
2120                         return r;
2121                 return -EIO;
2122         }
2123         if (n != 0) /* on success we should have read 0 bytes */
2124                 return -EIO;
2125
2126         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2127         pid = 0;
2128         if (r < 0)
2129                 return r;
2130         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2131                 return -EIO;
2132
2133         return 0;
2134 }
2135
2136 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2137         if (!context->dynamic_user)
2138                 return false;
2139
2140         if (type == EXEC_DIRECTORY_CONFIGURATION)
2141                 return false;
2142
2143         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2144                 return false;
2145
2146         return true;
2147 }
2148
2149 static int setup_exec_directory(
2150                 const ExecContext *context,
2151                 const ExecParameters *params,
2152                 uid_t uid,
2153                 gid_t gid,
2154                 ExecDirectoryType type,
2155                 int *exit_status) {
2156
2157         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2158                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2159                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2160                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2161                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2162                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2163         };
2164         char **rt;
2165         int r;
2166
2167         assert(context);
2168         assert(params);
2169         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2170         assert(exit_status);
2171
2172         if (!params->prefix[type])
2173                 return 0;
2174
2175         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2176                 if (!uid_is_valid(uid))
2177                         uid = 0;
2178                 if (!gid_is_valid(gid))
2179                         gid = 0;
2180         }
2181
2182         STRV_FOREACH(rt, context->directories[type].paths) {
2183                 _cleanup_free_ char *p = NULL, *pp = NULL;
2184
2185                 p = path_join(params->prefix[type], *rt);
2186                 if (!p) {
2187                         r = -ENOMEM;
2188                         goto fail;
2189                 }
2190
2191                 r = mkdir_parents_label(p, 0755);
2192                 if (r < 0)
2193                         goto fail;
2194
2195                 if (exec_directory_is_private(context, type)) {
2196                         _cleanup_free_ char *private_root = NULL;
2197
2198                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2199                          * case we want to avoid leaving a directory around fully accessible that is owned by
2200                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2201                          * trick used by container managers to prohibit host users to get access to files of
2202                          * the same UID in containers: we place everything inside a directory that has an
2203                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2204                          * for unprivileged host code. We then use fs namespacing to make this directory
2205                          * permeable for the service itself.
2206                          *
2207                          * Specifically: for a service which wants a special directory "foo/" we first create
2208                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2209                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2210                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2211                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2212                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2213                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2214                          * for the service and making sure it only gets access to the dirs it needs but no
2215                          * others. Tricky? Yes, absolutely, but it works!
2216                          *
2217                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2218                          * to be owned by the service itself.
2219                          *
2220                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2221                          * for sharing files or sockets with other services. */
2222
2223                         private_root = path_join(params->prefix[type], "private");
2224                         if (!private_root) {
2225                                 r = -ENOMEM;
2226                                 goto fail;
2227                         }
2228
2229                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2230                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2231                         if (r < 0)
2232                                 goto fail;
2233
2234                         pp = path_join(private_root, *rt);
2235                         if (!pp) {
2236                                 r = -ENOMEM;
2237                                 goto fail;
2238                         }
2239
2240                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2241                         r = mkdir_parents_label(pp, 0755);
2242                         if (r < 0)
2243                                 goto fail;
2244
2245                         if (is_dir(p, false) > 0 &&
2246                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2247
2248                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2249                                  * it over. Most likely the service has been upgraded from one that didn't use
2250                                  * DynamicUser=1, to one that does. */
2251
2252                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2253                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2254                                          exec_directory_type_to_string(type), p, pp);
2255
2256                                 if (rename(p, pp) < 0) {
2257                                         r = -errno;
2258                                         goto fail;
2259                                 }
2260                         } else {
2261                                 /* Otherwise, create the actual directory for the service */
2262
2263                                 r = mkdir_label(pp, context->directories[type].mode);
2264                                 if (r < 0 && r != -EEXIST)
2265                                         goto fail;
2266                         }
2267
2268                         /* And link it up from the original place */
2269                         r = symlink_idempotent(pp, p, true);
2270                         if (r < 0)
2271                                 goto fail;
2272
2273                 } else {
2274                         _cleanup_free_ char *target = NULL;
2275
2276                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2277                             readlink_and_make_absolute(p, &target) >= 0) {
2278                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2279
2280                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2281                                  * by DynamicUser=1 (see above)?
2282                                  *
2283                                  * We do this for all directory types except for ConfigurationDirectory=,
2284                                  * since they all support the private/ symlink logic at least in some
2285                                  * configurations, see above. */
2286
2287                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2288                                 if (r < 0)
2289                                         goto fail;
2290
2291                                 q = path_join(params->prefix[type], "private", *rt);
2292                                 if (!q) {
2293                                         r = -ENOMEM;
2294                                         goto fail;
2295                                 }
2296
2297                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2298                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2299                                 if (r < 0)
2300                                         goto fail;
2301
2302                                 if (path_equal(q_resolved, target_resolved)) {
2303
2304                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2305                                          * but is no longer. Let's move the directory back up. */
2306
2307                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2308                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2309                                                  exec_directory_type_to_string(type), q, p);
2310
2311                                         if (unlink(p) < 0) {
2312                                                 r = -errno;
2313                                                 goto fail;
2314                                         }
2315
2316                                         if (rename(q, p) < 0) {
2317                                                 r = -errno;
2318                                                 goto fail;
2319                                         }
2320                                 }
2321                         }
2322
2323                         r = mkdir_label(p, context->directories[type].mode);
2324                         if (r < 0) {
2325                                 if (r != -EEXIST)
2326                                         goto fail;
2327
2328                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2329                                         struct stat st;
2330
2331                                         /* Don't change the owner/access mode of the configuration directory,
2332                                          * as in the common case it is not written to by a service, and shall
2333                                          * not be writable. */
2334
2335                                         if (stat(p, &st) < 0) {
2336                                                 r = -errno;
2337                                                 goto fail;
2338                                         }
2339
2340                                         /* Still complain if the access mode doesn't match */
2341                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2342                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2343                                                             "(File system: %o %sMode: %o)",
2344                                                             exec_directory_type_to_string(type), *rt,
2345                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2346
2347                                         continue;
2348                                 }
2349                         }
2350                 }
2351
2352                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2353                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2354                  * current UID/GID ownership.) */
2355                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2356                 if (r < 0)
2357                         goto fail;
2358
2359                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2360                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2361                  * assignments to exist.*/
2362                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2363                 if (r < 0)
2364                         goto fail;
2365         }
2366
2367         return 0;
2368
2369 fail:
2370         *exit_status = exit_status_table[type];
2371         return r;
2372 }
2373
#if ENABLE_SMACK
/* Applies a SMACK process label via mac_smack_apply_pid(0, ...). If the context carries an explicit
 * smack_process_label that one is used. Otherwise, and only if SMACK_DEFAULT_PROCESS_LABEL is defined
 * at build time, the SMACK_ATTR_EXEC label read from the command's binary is used, falling back to
 * SMACK_DEFAULT_PROCESS_LABEL if none could be read. Returns 0 on success (or if there is nothing to
 * do), a negative errno-style value otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing label or lacking support for it is fine, anything else is an error */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2406
2407 static int compile_bind_mounts(
2408                 const ExecContext *context,
2409                 const ExecParameters *params,
2410                 BindMount **ret_bind_mounts,
2411                 size_t *ret_n_bind_mounts,
2412                 char ***ret_empty_directories) {
2413
2414         _cleanup_strv_free_ char **empty_directories = NULL;
2415         BindMount *bind_mounts;
2416         size_t n, h = 0, i;
2417         ExecDirectoryType t;
2418         int r;
2419
2420         assert(context);
2421         assert(params);
2422         assert(ret_bind_mounts);
2423         assert(ret_n_bind_mounts);
2424         assert(ret_empty_directories);
2425
2426         n = context->n_bind_mounts;
2427         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2428                 if (!params->prefix[t])
2429                         continue;
2430
2431                 n += strv_length(context->directories[t].paths);
2432         }
2433
2434         if (n <= 0) {
2435                 *ret_bind_mounts = NULL;
2436                 *ret_n_bind_mounts = 0;
2437                 *ret_empty_directories = NULL;
2438                 return 0;
2439         }
2440
2441         bind_mounts = new(BindMount, n);
2442         if (!bind_mounts)
2443                 return -ENOMEM;
2444
2445         for (i = 0; i < context->n_bind_mounts; i++) {
2446                 BindMount *item = context->bind_mounts + i;
2447                 char *s, *d;
2448
2449                 s = strdup(item->source);
2450                 if (!s) {
2451                         r = -ENOMEM;
2452                         goto finish;
2453                 }
2454
2455                 d = strdup(item->destination);
2456                 if (!d) {
2457                         free(s);
2458                         r = -ENOMEM;
2459                         goto finish;
2460                 }
2461
2462                 bind_mounts[h++] = (BindMount) {
2463                         .source = s,
2464                         .destination = d,
2465                         .read_only = item->read_only,
2466                         .recursive = item->recursive,
2467                         .ignore_enoent = item->ignore_enoent,
2468                 };
2469         }
2470
2471         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2472                 char **suffix;
2473
2474                 if (!params->prefix[t])
2475                         continue;
2476
2477                 if (strv_isempty(context->directories[t].paths))
2478                         continue;
2479
2480                 if (exec_directory_is_private(context, t) &&
2481                     !(context->root_directory || context->root_image)) {
2482                         char *private_root;
2483
2484                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2485                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2486                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2487
2488                         private_root = path_join(params->prefix[t], "private");
2489                         if (!private_root) {
2490                                 r = -ENOMEM;
2491                                 goto finish;
2492                         }
2493
2494                         r = strv_consume(&empty_directories, private_root);
2495                         if (r < 0)
2496                                 goto finish;
2497                 }
2498
2499                 STRV_FOREACH(suffix, context->directories[t].paths) {
2500                         char *s, *d;
2501
2502                         if (exec_directory_is_private(context, t))
2503                                 s = path_join(params->prefix[t], "private", *suffix);
2504                         else
2505                                 s = path_join(params->prefix[t], *suffix);
2506                         if (!s) {
2507                                 r = -ENOMEM;
2508                                 goto finish;
2509                         }
2510
2511                         if (exec_directory_is_private(context, t) &&
2512                             (context->root_directory || context->root_image))
2513                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2514                                  * directory is not created on the root directory. So, let's bind-mount the directory
2515                                  * on the 'non-private' place. */
2516                                 d = path_join(params->prefix[t], *suffix);
2517                         else
2518                                 d = strdup(s);
2519                         if (!d) {
2520                                 free(s);
2521                                 r = -ENOMEM;
2522                                 goto finish;
2523                         }
2524
2525                         bind_mounts[h++] = (BindMount) {
2526                                 .source = s,
2527                                 .destination = d,
2528                                 .read_only = false,
2529                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2530                                 .recursive = true,
2531                                 .ignore_enoent = false,
2532                         };
2533                 }
2534         }
2535
2536         assert(h == n);
2537
2538         *ret_bind_mounts = bind_mounts;
2539         *ret_n_bind_mounts = n;
2540         *ret_empty_directories = TAKE_PTR(empty_directories);
2541
2542         return (int) n;
2543
2544 finish:
2545         bind_mount_free_many(bind_mounts, h);
2546         return r;
2547 }
2548
2549 static bool insist_on_sandboxing(
2550                 const ExecContext *context,
2551                 const char *root_dir,
2552                 const char *root_image,
2553                 const BindMount *bind_mounts,
2554                 size_t n_bind_mounts) {
2555
2556         size_t i;
2557
2558         assert(context);
2559         assert(n_bind_mounts == 0 || bind_mounts);
2560
2561         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2562          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2563          * rearrange stuff in a way we cannot ignore gracefully. */
2564
2565         if (context->n_temporary_filesystems > 0)
2566                 return true;
2567
2568         if (root_dir || root_image)
2569                 return true;
2570
2571         if (context->dynamic_user)
2572                 return true;
2573
2574         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2575          * essential. */
2576         for (i = 0; i < n_bind_mounts; i++)
2577                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2578                         return true;
2579
2580         if (context->log_namespace)
2581                 return true;
2582
2583         return false;
2584 }
2585
2586 static int apply_mount_namespace(
2587                 const Unit *u,
2588                 const ExecCommand *command,
2589                 const ExecContext *context,
2590                 const ExecParameters *params,
2591                 const ExecRuntime *runtime,
2592                 char **error_path) {
2593
2594         _cleanup_strv_free_ char **empty_directories = NULL;
2595         char *tmp = NULL, *var = NULL;
2596         const char *root_dir = NULL, *root_image = NULL;
2597         NamespaceInfo ns_info;
2598         bool needs_sandboxing;
2599         BindMount *bind_mounts = NULL;
2600         size_t n_bind_mounts = 0;
2601         int r;
2602
2603         assert(context);
2604
2605         if (params->flags & EXEC_APPLY_CHROOT) {
2606                 root_image = context->root_image;
2607
2608                 if (!root_image)
2609                         root_dir = context->root_directory;
2610         }
2611
2612         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2613         if (r < 0)
2614                 return r;
2615
2616         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2617         if (needs_sandboxing) {
2618                 /* The runtime struct only contains the parent of the private /tmp,
2619                  * which is non-accessible to world users. Inside of it there's a /tmp
2620                  * that is sticky, and that's the one we want to use here. */
2621
2622                 if (context->private_tmp && runtime) {
2623                         if (runtime->tmp_dir)
2624                                 tmp = strjoina(runtime->tmp_dir, "/tmp");
2625                         if (runtime->var_tmp_dir)
2626                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
2627                 }
2628
2629                 ns_info = (NamespaceInfo) {
2630                         .ignore_protect_paths = false,
2631                         .private_dev = context->private_devices,
2632                         .protect_control_groups = context->protect_control_groups,
2633                         .protect_kernel_tunables = context->protect_kernel_tunables,
2634                         .protect_kernel_modules = context->protect_kernel_modules,
2635                         .protect_kernel_logs = context->protect_kernel_logs,
2636                         .protect_hostname = context->protect_hostname,
2637                         .mount_apivfs = context->mount_apivfs,
2638                         .private_mounts = context->private_mounts,
2639                 };
2640         } else if (!context->dynamic_user && root_dir)
2641                 /*
2642                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2643                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2644                  * fail if we are enable to apply the sandbox inside the mount namespace.
2645                  */
2646                 ns_info = (NamespaceInfo) {
2647                         .ignore_protect_paths = true,
2648                 };
2649         else
2650                 ns_info = (NamespaceInfo) {};
2651
2652         if (context->mount_flags == MS_SHARED)
2653                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2654
2655         r = setup_namespace(root_dir, root_image,
2656                             &ns_info, context->read_write_paths,
2657                             needs_sandboxing ? context->read_only_paths : NULL,
2658                             needs_sandboxing ? context->inaccessible_paths : NULL,
2659                             empty_directories,
2660                             bind_mounts,
2661                             n_bind_mounts,
2662                             context->temporary_filesystems,
2663                             context->n_temporary_filesystems,
2664                             tmp,
2665                             var,
2666                             context->log_namespace,
2667                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2668                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2669                             context->mount_flags,
2670                             context->root_hash, context->root_hash_size, context->root_hash_path, context->root_verity,
2671                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2672                             error_path);
2673
2674         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2675          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2676          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2677          * completely different execution environment. */
2678         if (r == -ENOANO) {
2679                 if (insist_on_sandboxing(
2680                                     context,
2681                                     root_dir, root_image,
2682                                     bind_mounts,
2683                                     n_bind_mounts)) {
2684                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2685                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2686                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2687
2688                         r = -EOPNOTSUPP;
2689                 } else {
2690                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2691                         r = 0;
2692                 }
2693         }
2694
2695         bind_mount_free_many(bind_mounts, n_bind_mounts);
2696         return r;
2697 }
2698
2699 static int apply_working_directory(
2700                 const ExecContext *context,
2701                 const ExecParameters *params,
2702                 const char *home,
2703                 int *exit_status) {
2704
2705         const char *d, *wd;
2706
2707         assert(context);
2708         assert(exit_status);
2709
2710         if (context->working_directory_home) {
2711
2712                 if (!home) {
2713                         *exit_status = EXIT_CHDIR;
2714                         return -ENXIO;
2715                 }
2716
2717                 wd = home;
2718
2719         } else if (context->working_directory)
2720                 wd = context->working_directory;
2721         else
2722                 wd = "/";
2723
2724         if (params->flags & EXEC_APPLY_CHROOT)
2725                 d = wd;
2726         else
2727                 d = prefix_roota(context->root_directory, wd);
2728
2729         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2730                 *exit_status = EXIT_CHDIR;
2731                 return -errno;
2732         }
2733
2734         return 0;
2735 }
2736
2737 static int apply_root_directory(
2738                 const ExecContext *context,
2739                 const ExecParameters *params,
2740                 const bool needs_mount_ns,
2741                 int *exit_status) {
2742
2743         assert(context);
2744         assert(exit_status);
2745
2746         if (params->flags & EXEC_APPLY_CHROOT) {
2747                 if (!needs_mount_ns && context->root_directory)
2748                         if (chroot(context->root_directory) < 0) {
2749                                 *exit_status = EXIT_CHROOT;
2750                                 return -errno;
2751                         }
2752         }
2753
2754         return 0;
2755 }
2756
2757 static int setup_keyring(
2758                 const Unit *u,
2759                 const ExecContext *context,
2760                 const ExecParameters *p,
2761                 uid_t uid, gid_t gid) {
2762
2763         key_serial_t keyring;
2764         int r = 0;
2765         uid_t saved_uid;
2766         gid_t saved_gid;
2767
2768         assert(u);
2769         assert(context);
2770         assert(p);
2771
2772         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2773          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2774          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2775          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2776          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2777          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2778
2779         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2780                 return 0;
2781
2782         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2783          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2784          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2785          * & group is just as nasty as acquiring a reference to the user keyring. */
2786
2787         saved_uid = getuid();
2788         saved_gid = getgid();
2789
2790         if (gid_is_valid(gid) && gid != saved_gid) {
2791                 if (setregid(gid, -1) < 0)
2792                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2793         }
2794
2795         if (uid_is_valid(uid) && uid != saved_uid) {
2796                 if (setreuid(uid, -1) < 0) {
2797                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2798                         goto out;
2799                 }
2800         }
2801
2802         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2803         if (keyring == -1) {
2804                 if (errno == ENOSYS)
2805                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2806                 else if (IN_SET(errno, EACCES, EPERM))
2807                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2808                 else if (errno == EDQUOT)
2809                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2810                 else
2811                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2812
2813                 goto out;
2814         }
2815
2816         /* When requested link the user keyring into the session keyring. */
2817         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2818
2819                 if (keyctl(KEYCTL_LINK,
2820                            KEY_SPEC_USER_KEYRING,
2821                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2822                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2823                         goto out;
2824                 }
2825         }
2826
2827         /* Restore uid/gid back */
2828         if (uid_is_valid(uid) && uid != saved_uid) {
2829                 if (setreuid(saved_uid, -1) < 0) {
2830                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2831                         goto out;
2832                 }
2833         }
2834
2835         if (gid_is_valid(gid) && gid != saved_gid) {
2836                 if (setregid(saved_gid, -1) < 0)
2837                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2838         }
2839
2840         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2841         if (!sd_id128_is_null(u->invocation_id)) {
2842                 key_serial_t key;
2843
2844                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2845                 if (key == -1)
2846                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2847                 else {
2848                         if (keyctl(KEYCTL_SETPERM, key,
2849                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2850                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2851                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2852                 }
2853         }
2854
2855 out:
2856         /* Revert back uid & gid for the the last time, and exit */
2857         /* no extra logging, as only the first already reported error matters */
2858         if (getuid() != saved_uid)
2859                 (void) setreuid(saved_uid, -1);
2860
2861         if (getgid() != saved_gid)
2862                 (void) setregid(saved_gid, -1);
2863
2864         return r;
2865 }
2866
/* Appends each valid (i.e. non-negative) fd of the given pair to array, in order, advancing the element
 * counter *n for every fd added. The caller must make sure array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2877
2878 static int close_remaining_fds(
2879                 const ExecParameters *params,
2880                 const ExecRuntime *runtime,
2881                 const DynamicCreds *dcreds,
2882                 int user_lookup_fd,
2883                 int socket_fd,
2884                 int exec_fd,
2885                 const int *fds, size_t n_fds) {
2886
2887         size_t n_dont_close = 0;
2888         int dont_close[n_fds + 12];
2889
2890         assert(params);
2891
2892         if (params->stdin_fd >= 0)
2893                 dont_close[n_dont_close++] = params->stdin_fd;
2894         if (params->stdout_fd >= 0)
2895                 dont_close[n_dont_close++] = params->stdout_fd;
2896         if (params->stderr_fd >= 0)
2897                 dont_close[n_dont_close++] = params->stderr_fd;
2898
2899         if (socket_fd >= 0)
2900                 dont_close[n_dont_close++] = socket_fd;
2901         if (exec_fd >= 0)
2902                 dont_close[n_dont_close++] = exec_fd;
2903         if (n_fds > 0) {
2904                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2905                 n_dont_close += n_fds;
2906         }
2907
2908         if (runtime)
2909                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2910
2911         if (dcreds) {
2912                 if (dcreds->user)
2913                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2914                 if (dcreds->group)
2915                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2916         }
2917
2918         if (user_lookup_fd >= 0)
2919                 dont_close[n_dont_close++] = user_lookup_fd;
2920
2921         return close_all_fds(dont_close, n_dont_close);
2922 }
2923
2924 static int send_user_lookup(
2925                 Unit *unit,
2926                 int user_lookup_fd,
2927                 uid_t uid,
2928                 gid_t gid) {
2929
2930         assert(unit);
2931
2932         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2933          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2934          * specified. */
2935
2936         if (user_lookup_fd < 0)
2937                 return 0;
2938
2939         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2940                 return 0;
2941
2942         if (writev(user_lookup_fd,
2943                (struct iovec[]) {
2944                            IOVEC_INIT(&uid, sizeof(uid)),
2945                            IOVEC_INIT(&gid, sizeof(gid)),
2946                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2947                 return -errno;
2948
2949         return 0;
2950 }
2951
2952 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2953         int r;
2954
2955         assert(c);
2956         assert(home);
2957         assert(buf);
2958
2959         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2960
2961         if (*home)
2962                 return 0;
2963
2964         if (!c->working_directory_home)
2965                 return 0;
2966
2967         r = get_home_dir(buf);
2968         if (r < 0)
2969                 return r;
2970
2971         *home = *buf;
2972         return 1;
2973 }
2974
2975 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2976         _cleanup_strv_free_ char ** list = NULL;
2977         ExecDirectoryType t;
2978         int r;
2979
2980         assert(c);
2981         assert(p);
2982         assert(ret);
2983
2984         assert(c->dynamic_user);
2985
2986         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2987          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2988          * directories. */
2989
2990         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2991                 char **i;
2992
2993                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2994                         continue;
2995
2996                 if (!p->prefix[t])
2997                         continue;
2998
2999                 STRV_FOREACH(i, c->directories[t].paths) {
3000                         char *e;
3001
3002                         if (exec_directory_is_private(c, t))
3003                                 e = path_join(p->prefix[t], "private", *i);
3004                         else
3005                                 e = path_join(p->prefix[t], *i);
3006                         if (!e)
3007                                 return -ENOMEM;
3008
3009                         r = strv_consume(&list, e);
3010                         if (r < 0)
3011                                 return r;
3012                 }
3013         }
3014
3015         *ret = TAKE_PTR(list);
3016
3017         return 0;
3018 }
3019
3020 static char *exec_command_line(char **argv);
3021
3022 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3023         bool using_subcgroup;
3024         char *p;
3025
3026         assert(params);
3027         assert(ret);
3028
3029         if (!params->cgroup_path)
3030                 return -EINVAL;
3031
3032         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3033          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3034          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3035          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3036          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3037          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3038          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3039          * flag, which is only passed for the former statements, not for the latter. */
3040
3041         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3042         if (using_subcgroup)
3043                 p = path_join(params->cgroup_path, ".control");
3044         else
3045                 p = strdup(params->cgroup_path);
3046         if (!p)
3047                 return -ENOMEM;
3048
3049         *ret = p;
3050         return using_subcgroup;
3051 }
3052
3053 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3054         _cleanup_(cpu_set_reset) CPUSet s = {};
3055         int r;
3056
3057         assert(c);
3058         assert(ret);
3059
3060         if (!c->numa_policy.nodes.set) {
3061                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3062                 return 0;
3063         }
3064
3065         r = numa_to_cpu_set(&c->numa_policy, &s);
3066         if (r < 0)
3067                 return r;
3068
3069         cpu_set_reset(ret);
3070
3071         return cpu_set_add_all(ret, &s);
3072 }
3073
3074 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3075         assert(c);
3076
3077         return c->cpu_affinity_from_numa;
3078 }
3079
3080 static int exec_child(
3081                 Unit *unit,
3082                 const ExecCommand *command,
3083                 const ExecContext *context,
3084                 const ExecParameters *params,
3085                 ExecRuntime *runtime,
3086                 DynamicCreds *dcreds,
3087                 int socket_fd,
3088                 const int named_iofds[static 3],
3089                 int *fds,
3090                 size_t n_socket_fds,
3091                 size_t n_storage_fds,
3092                 char **files_env,
3093                 int user_lookup_fd,
3094                 int *exit_status) {
3095
3096         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3097         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3098         _cleanup_free_ gid_t *supplementary_gids = NULL;
3099         const char *username = NULL, *groupname = NULL;
3100         _cleanup_free_ char *home_buffer = NULL;
3101         const char *home = NULL, *shell = NULL;
3102         char **final_argv = NULL;
3103         dev_t journal_stream_dev = 0;
3104         ino_t journal_stream_ino = 0;
3105         bool userns_set_up = false;
3106         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3107                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3108                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3109                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3110 #if HAVE_SELINUX
3111         _cleanup_free_ char *mac_selinux_context_net = NULL;
3112         bool use_selinux = false;
3113 #endif
3114 #if ENABLE_SMACK
3115         bool use_smack = false;
3116 #endif
3117 #if HAVE_APPARMOR
3118         bool use_apparmor = false;
3119 #endif
3120         uid_t saved_uid = getuid();
3121         gid_t saved_gid = getgid();
3122         uid_t uid = UID_INVALID;
3123         gid_t gid = GID_INVALID;
3124         size_t n_fds;
3125         ExecDirectoryType dt;
3126         int secure_bits;
3127         _cleanup_free_ gid_t *gids_after_pam = NULL;
3128         int ngids_after_pam = 0;
3129
3130         assert(unit);
3131         assert(command);
3132         assert(context);
3133         assert(params);
3134         assert(exit_status);
3135
3136         rename_process_from_path(command->path);
3137
3138         /* We reset exactly these signals, since they are the
3139          * only ones we set to SIG_IGN in the main daemon. All
3140          * others we leave untouched because we set them to
3141          * SIG_DFL or a valid handler initially, both of which
3142          * will be demoted to SIG_DFL. */
3143         (void) default_signals(SIGNALS_CRASH_HANDLER,
3144                                SIGNALS_IGNORE, -1);
3145
3146         if (context->ignore_sigpipe)
3147                 (void) ignore_signals(SIGPIPE, -1);
3148
3149         r = reset_signal_mask();
3150         if (r < 0) {
3151                 *exit_status = EXIT_SIGNAL_MASK;
3152                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3153         }
3154
3155         if (params->idle_pipe)
3156                 do_idle_pipe_dance(params->idle_pipe);
3157
3158         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3159          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3160          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3161          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3162
3163         log_forget_fds();
3164         log_set_open_when_needed(true);
3165
3166         /* In case anything used libc syslog(), close this here, too */
3167         closelog();
3168
3169         n_fds = n_socket_fds + n_storage_fds;
3170         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3171         if (r < 0) {
3172                 *exit_status = EXIT_FDS;
3173                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3174         }
3175
3176         if (!context->same_pgrp)
3177                 if (setsid() < 0) {
3178                         *exit_status = EXIT_SETSID;
3179                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3180                 }
3181
3182         exec_context_tty_reset(context, params);
3183
3184         if (unit_shall_confirm_spawn(unit)) {
3185                 const char *vc = params->confirm_spawn;
3186                 _cleanup_free_ char *cmdline = NULL;
3187
3188                 cmdline = exec_command_line(command->argv);
3189                 if (!cmdline) {
3190                         *exit_status = EXIT_MEMORY;
3191                         return log_oom();
3192                 }
3193
3194                 r = ask_for_confirmation(vc, unit, cmdline);
3195                 if (r != CONFIRM_EXECUTE) {
3196                         if (r == CONFIRM_PRETEND_SUCCESS) {
3197                                 *exit_status = EXIT_SUCCESS;
3198                                 return 0;
3199                         }
3200                         *exit_status = EXIT_CONFIRM;
3201                         log_unit_error(unit, "Execution cancelled by the user");
3202                         return -ECANCELED;
3203                 }
3204         }
3205
3206         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3207          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3208          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3209          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3210          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3211         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3212             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3213                 *exit_status = EXIT_MEMORY;
3214                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3215         }
3216
3217         if (context->dynamic_user && dcreds) {
3218                 _cleanup_strv_free_ char **suggested_paths = NULL;
3219
3220                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3221                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3222                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3223                         *exit_status = EXIT_USER;
3224                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3225                 }
3226
3227                 r = compile_suggested_paths(context, params, &suggested_paths);
3228                 if (r < 0) {
3229                         *exit_status = EXIT_MEMORY;
3230                         return log_oom();
3231                 }
3232
3233                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3234                 if (r < 0) {
3235                         *exit_status = EXIT_USER;
3236                         if (r == -EILSEQ) {
3237                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3238                                 return -EOPNOTSUPP;
3239                         }
3240                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3241                 }
3242
3243                 if (!uid_is_valid(uid)) {
3244                         *exit_status = EXIT_USER;
3245                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3246                         return -ESRCH;
3247                 }
3248
3249                 if (!gid_is_valid(gid)) {
3250                         *exit_status = EXIT_USER;
3251                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3252                         return -ESRCH;
3253                 }
3254
3255                 if (dcreds->user)
3256                         username = dcreds->user->name;
3257
3258         } else {
3259                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3260                 if (r < 0) {
3261                         *exit_status = EXIT_USER;
3262                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3263                 }
3264
3265                 r = get_fixed_group(context, &groupname, &gid);
3266                 if (r < 0) {
3267                         *exit_status = EXIT_GROUP;
3268                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3269                 }
3270         }
3271
3272         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3273         r = get_supplementary_groups(context, username, groupname, gid,
3274                                      &supplementary_gids, &ngids);
3275         if (r < 0) {
3276                 *exit_status = EXIT_GROUP;
3277                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3278         }
3279
3280         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3281         if (r < 0) {
3282                 *exit_status = EXIT_USER;
3283                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3284         }
3285
3286         user_lookup_fd = safe_close(user_lookup_fd);
3287
3288         r = acquire_home(context, uid, &home, &home_buffer);
3289         if (r < 0) {
3290                 *exit_status = EXIT_CHDIR;
3291                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3292         }
3293
3294         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3295          * must be sure to drop O_NONBLOCK */
3296         if (socket_fd >= 0)
3297                 (void) fd_nonblock(socket_fd, false);
3298
3299         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3300          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3301         if (params->cgroup_path) {
3302                 _cleanup_free_ char *p = NULL;
3303
3304                 r = exec_parameters_get_cgroup_path(params, &p);
3305                 if (r < 0) {
3306                         *exit_status = EXIT_CGROUP;
3307                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3308                 }
3309
3310                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3311                 if (r < 0) {
3312                         *exit_status = EXIT_CGROUP;
3313                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3314                 }
3315         }
3316
3317         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3318                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3319                 if (r < 0) {
3320                         *exit_status = EXIT_NETWORK;
3321                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3322                 }
3323         }
3324
3325         r = setup_input(context, params, socket_fd, named_iofds);
3326         if (r < 0) {
3327                 *exit_status = EXIT_STDIN;
3328                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3329         }
3330
3331         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3332         if (r < 0) {
3333                 *exit_status = EXIT_STDOUT;
3334                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3335         }
3336
3337         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3338         if (r < 0) {
3339                 *exit_status = EXIT_STDERR;
3340                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3341         }
3342
3343         if (context->oom_score_adjust_set) {
3344                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3345                  * prohibit write access to this file, and we shouldn't trip up over that. */
3346                 r = set_oom_score_adjust(context->oom_score_adjust);
3347                 if (IN_SET(r, -EPERM, -EACCES))
3348                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3349                 else if (r < 0) {
3350                         *exit_status = EXIT_OOM_ADJUST;
3351                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3352                 }
3353         }
3354
3355         if (context->coredump_filter_set) {
3356                 r = set_coredump_filter(context->coredump_filter);
3357                 if (ERRNO_IS_PRIVILEGE(r))
3358                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3359                 else if (r < 0)
3360                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3361         }
3362
3363         if (context->nice_set) {
3364                 r = setpriority_closest(context->nice);
3365                 if (r < 0)
3366                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3367         }
3368
3369         if (context->cpu_sched_set) {
3370                 struct sched_param param = {
3371                         .sched_priority = context->cpu_sched_priority,
3372                 };
3373
3374                 r = sched_setscheduler(0,
3375                                        context->cpu_sched_policy |
3376                                        (context->cpu_sched_reset_on_fork ?
3377                                         SCHED_RESET_ON_FORK : 0),
3378                                        &param);
3379                 if (r < 0) {
3380                         *exit_status = EXIT_SETSCHEDULER;
3381                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3382                 }
3383         }
3384
3385         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3386                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3387                 const CPUSet *cpu_set;
3388
3389                 if (context->cpu_affinity_from_numa) {
3390                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3391                         if (r < 0) {
3392                                 *exit_status = EXIT_CPUAFFINITY;
3393                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3394                         }
3395
3396                         cpu_set = &converted_cpu_set;
3397                 } else
3398                         cpu_set = &context->cpu_set;
3399
3400                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3401                         *exit_status = EXIT_CPUAFFINITY;
3402                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3403                 }
3404         }
3405
3406         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3407                 r = apply_numa_policy(&context->numa_policy);
3408                 if (r == -EOPNOTSUPP)
3409                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3410                 else if (r < 0) {
3411                         *exit_status = EXIT_NUMA_POLICY;
3412                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3413                 }
3414         }
3415
3416         if (context->ioprio_set)
3417                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3418                         *exit_status = EXIT_IOPRIO;
3419                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3420                 }
3421
3422         if (context->timer_slack_nsec != NSEC_INFINITY)
3423                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3424                         *exit_status = EXIT_TIMERSLACK;
3425                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3426                 }
3427
3428         if (context->personality != PERSONALITY_INVALID) {
3429                 r = safe_personality(context->personality);
3430                 if (r < 0) {
3431                         *exit_status = EXIT_PERSONALITY;
3432                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3433                 }
3434         }
3435
3436         if (context->utmp_id)
3437                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3438                                       context->tty_path,
3439                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3440                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3441                                       USER_PROCESS,
3442                                       username);
3443
3444         if (uid_is_valid(uid)) {
3445                 r = chown_terminal(STDIN_FILENO, uid);
3446                 if (r < 0) {
3447                         *exit_status = EXIT_STDIN;
3448                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3449                 }
3450         }
3451
3452         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3453          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3454          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3455          * touch a single hierarchy too. */
3456         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3457                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3458                 if (r < 0) {
3459                         *exit_status = EXIT_CGROUP;
3460                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3461                 }
3462         }
3463
3464         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3465                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3466                 if (r < 0)
3467                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3468         }
3469
3470         r = build_environment(
3471                         unit,
3472                         context,
3473                         params,
3474                         n_fds,
3475                         home,
3476                         username,
3477                         shell,
3478                         journal_stream_dev,
3479                         journal_stream_ino,
3480                         &our_env);
3481         if (r < 0) {
3482                 *exit_status = EXIT_MEMORY;
3483                 return log_oom();
3484         }
3485
3486         r = build_pass_environment(context, &pass_env);
3487         if (r < 0) {
3488                 *exit_status = EXIT_MEMORY;
3489                 return log_oom();
3490         }
3491
3492         accum_env = strv_env_merge(5,
3493                                    params->environment,
3494                                    our_env,
3495                                    pass_env,
3496                                    context->environment,
3497                                    files_env);
3498         if (!accum_env) {
3499                 *exit_status = EXIT_MEMORY;
3500                 return log_oom();
3501         }
3502         accum_env = strv_env_clean(accum_env);
3503
3504         (void) umask(context->umask);
3505
3506         r = setup_keyring(unit, context, params, uid, gid);
3507         if (r < 0) {
3508                 *exit_status = EXIT_KEYRING;
3509                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3510         }
3511
3512         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3513         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3514
3515         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3516         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3517
3518         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3519         if (needs_ambient_hack)
3520                 needs_setuid = false;
3521         else
3522                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3523
3524         if (needs_sandboxing) {
3525                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3526                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3527                  * impacting our own code paths. */
3528
3529 #if HAVE_SELINUX
3530                 use_selinux = mac_selinux_use();
3531 #endif
3532 #if ENABLE_SMACK
3533                 use_smack = mac_smack_use();
3534 #endif
3535 #if HAVE_APPARMOR
3536                 use_apparmor = mac_apparmor_use();
3537 #endif
3538         }
3539
3540         if (needs_sandboxing) {
3541                 int which_failed;
3542
3543                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3544                  * is set here. (See below.) */
3545
3546                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3547                 if (r < 0) {
3548                         *exit_status = EXIT_LIMITS;
3549                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3550                 }
3551         }
3552
3553         if (needs_setuid) {
3554
3555                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3556                  * wins here. (See above.) */
3557
3558                 if (context->pam_name && username) {
3559                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3560                         if (r < 0) {
3561                                 *exit_status = EXIT_PAM;
3562                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3563                         }
3564
3565                         ngids_after_pam = getgroups_alloc(&gids_after_pam);
3566                         if (ngids_after_pam < 0) {
3567                                 *exit_status = EXIT_MEMORY;
3568                                 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3569                         }
3570                 }
3571         }
3572
3573         if (needs_sandboxing) {
3574 #if HAVE_SELINUX
3575                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3576                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3577                         if (r < 0) {
3578                                 *exit_status = EXIT_SELINUX_CONTEXT;
3579                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3580                         }
3581                 }
3582 #endif
3583
3584                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3585                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3586                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3587                 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3588                         userns_set_up = true;
3589                         r = setup_private_users(saved_uid, saved_gid, uid, gid);
3590                         if (r < 0) {
3591                                 *exit_status = EXIT_USER;
3592                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3593                         }
3594                 }
3595         }
3596
3597         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3598
3599                 if (ns_type_supported(NAMESPACE_NET)) {
3600                         r = setup_netns(runtime->netns_storage_socket);
3601                         if (r == -EPERM)
3602                                 log_unit_warning_errno(unit, r,
3603                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3604                         else if (r < 0) {
3605                                 *exit_status = EXIT_NETWORK;
3606                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3607                         }
3608                 } else if (context->network_namespace_path) {
3609                         *exit_status = EXIT_NETWORK;
3610                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3611                                                     "NetworkNamespacePath= is not supported, refusing.");
3612                 } else
3613                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3614         }
3615
3616         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3617         if (needs_mount_namespace) {
3618                 _cleanup_free_ char *error_path = NULL;
3619
3620                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3621                 if (r < 0) {
3622                         *exit_status = EXIT_NAMESPACE;
3623                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3624                                                     error_path ? ": " : "", strempty(error_path));
3625                 }
3626         }
3627
3628         if (needs_sandboxing) {
3629                 r = apply_protect_hostname(unit, context, exit_status);
3630                 if (r < 0)
3631                         return r;
3632         }
3633
3634         /* Drop groups as early as possible.
3635          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3636          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3637         if (needs_setuid) {
3638                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3639                 int ngids_to_enforce = 0;
3640
3641                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3642                                                    ngids,
3643                                                    gids_after_pam,
3644                                                    ngids_after_pam,
3645                                                    &gids_to_enforce);
3646                 if (ngids_to_enforce < 0) {
3647                         *exit_status = EXIT_MEMORY;
3648                         return log_unit_error_errno(unit,
3649                                                     ngids_to_enforce,
3650                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
3651                 }
3652
3653                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3654                 if (r < 0) {
3655                         *exit_status = EXIT_GROUP;
3656                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3657                 }
3658         }
3659
3660         /* If the user namespace was not set up above, try to do it now.
3661          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3662          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3663          * case of mount namespaces being less privileged when the mount point list is copied from a
3664          * different user namespace). */
3665
3666         if (needs_sandboxing && context->private_users && !userns_set_up) {
3667                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3668                 if (r < 0) {
3669                         *exit_status = EXIT_USER;
3670                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3671                 }
3672         }
3673
3674         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3675          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3676          * however if we have it as we want to keep it open until the final execve(). */
3677
3678         if (params->exec_fd >= 0) {
3679                 exec_fd = params->exec_fd;
3680
3681                 if (exec_fd < 3 + (int) n_fds) {
3682                         int moved_fd;
3683
3684                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3685                          * process we are about to execute. */
3686
3687                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3688                         if (moved_fd < 0) {
3689                                 *exit_status = EXIT_FDS;
3690                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3691                         }
3692
3693                         safe_close(exec_fd);
3694                         exec_fd = moved_fd;
3695                 } else {
3696                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3697                         r = fd_cloexec(exec_fd, true);
3698                         if (r < 0) {
3699                                 *exit_status = EXIT_FDS;
3700                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3701                         }
3702                 }
3703
3704                 fds_with_exec_fd = newa(int, n_fds + 1);
3705                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3706                 fds_with_exec_fd[n_fds] = exec_fd;
3707                 n_fds_with_exec_fd = n_fds + 1;
3708         } else {
3709                 fds_with_exec_fd = fds;
3710                 n_fds_with_exec_fd = n_fds;
3711         }
3712
3713         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3714         if (r >= 0)
3715                 r = shift_fds(fds, n_fds);
3716         if (r >= 0)
3717                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3718         if (r < 0) {
3719                 *exit_status = EXIT_FDS;
3720                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3721         }
3722
3723         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3724          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3725          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3726          * came this far. */
3727
3728         secure_bits = context->secure_bits;
3729
3730         if (needs_sandboxing) {
3731                 uint64_t bset;
3732
3733                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3734                  * requested. (Note this is placed after the general resource limit initialization, see
3735                  * above, in order to take precedence.) */
3736                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3737                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3738                                 *exit_status = EXIT_LIMITS;
3739                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3740                         }
3741                 }
3742
3743 #if ENABLE_SMACK
3744                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3745                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3746                 if (use_smack) {
3747                         r = setup_smack(context, command);
3748                         if (r < 0) {
3749                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3750                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3751                         }
3752                 }
3753 #endif
3754
3755                 bset = context->capability_bounding_set;
3756                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3757                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3758                  * instead of us doing that */
3759                 if (needs_ambient_hack)
3760                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3761                                 (UINT64_C(1) << CAP_SETUID) |
3762                                 (UINT64_C(1) << CAP_SETGID);
3763
3764                 if (!cap_test_all(bset)) {
3765                         r = capability_bounding_set_drop(bset, false);
3766                         if (r < 0) {
3767                                 *exit_status = EXIT_CAPABILITIES;
3768                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3769                         }
3770                 }
3771
3772                 /* This is done before enforce_user, but ambient set
3773                  * does not survive over setresuid() if keep_caps is not set. */
3774                 if (!needs_ambient_hack) {
3775                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3776                         if (r < 0) {
3777                                 *exit_status = EXIT_CAPABILITIES;
3778                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3779                         }
3780                 }
3781         }
3782
3783         /* chroot to root directory first, before we lose the ability to chroot */
3784         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3785         if (r < 0)
3786                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3787
3788         if (needs_setuid) {
3789                 if (uid_is_valid(uid)) {
3790                         r = enforce_user(context, uid);
3791                         if (r < 0) {
3792                                 *exit_status = EXIT_USER;
3793                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3794                         }
3795
3796                         if (!needs_ambient_hack &&
3797                             context->capability_ambient_set != 0) {
3798
3799                                 /* Fix the ambient capabilities after user change. */
3800                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3801                                 if (r < 0) {
3802                                         *exit_status = EXIT_CAPABILITIES;
3803                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3804                                 }
3805
3806                                 /* If we were asked to change user and ambient capabilities
3807                                  * were requested, we had to add keep-caps to the securebits
3808                                  * so that we would maintain the inherited capability set
3809                                  * through the setresuid(). Make sure that the bit is added
3810                                  * also to the context secure_bits so that we don't try to
3811                                  * drop the bit away next. */
3812
3813                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3814                         }
3815                 }
3816         }
3817
3818         /* Apply working directory here, because the working directory might be on NFS and only the user running
3819          * this service might have the correct privilege to change to the working directory */
3820         r = apply_working_directory(context, params, home, exit_status);
3821         if (r < 0)
3822                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3823
3824         if (needs_sandboxing) {
3825                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3826                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3827                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3828                  * are restricted. */
3829
3830 #if HAVE_SELINUX
3831                 if (use_selinux) {
3832                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3833
3834                         if (exec_context) {
3835                                 r = setexeccon(exec_context);
3836                                 if (r < 0) {
3837                                         *exit_status = EXIT_SELINUX_CONTEXT;
3838                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3839                                 }
3840                         }
3841                 }
3842 #endif
3843
3844 #if HAVE_APPARMOR
3845                 if (use_apparmor && context->apparmor_profile) {
3846                         r = aa_change_onexec(context->apparmor_profile);
3847                         if (r < 0 && !context->apparmor_profile_ignore) {
3848                                 *exit_status = EXIT_APPARMOR_PROFILE;
3849                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3850                         }
3851                 }
3852 #endif
3853
3854                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3855                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3856                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3857                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3858                                 *exit_status = EXIT_SECUREBITS;
3859                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3860                         }
3861
3862                 if (context_has_no_new_privileges(context))
3863                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3864                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3865                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3866                         }
3867
3868 #if HAVE_SECCOMP
3869                 r = apply_address_families(unit, context);
3870                 if (r < 0) {
3871                         *exit_status = EXIT_ADDRESS_FAMILIES;
3872                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3873                 }
3874
3875                 r = apply_memory_deny_write_execute(unit, context);
3876                 if (r < 0) {
3877                         *exit_status = EXIT_SECCOMP;
3878                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3879                 }
3880
3881                 r = apply_restrict_realtime(unit, context);
3882                 if (r < 0) {
3883                         *exit_status = EXIT_SECCOMP;
3884                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3885                 }
3886
3887                 r = apply_restrict_suid_sgid(unit, context);
3888                 if (r < 0) {
3889                         *exit_status = EXIT_SECCOMP;
3890                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3891                 }
3892
3893                 r = apply_restrict_namespaces(unit, context);
3894                 if (r < 0) {
3895                         *exit_status = EXIT_SECCOMP;
3896                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3897                 }
3898
3899                 r = apply_protect_sysctl(unit, context);
3900                 if (r < 0) {
3901                         *exit_status = EXIT_SECCOMP;
3902                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3903                 }
3904
3905                 r = apply_protect_kernel_modules(unit, context);
3906                 if (r < 0) {
3907                         *exit_status = EXIT_SECCOMP;
3908                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3909                 }
3910
3911                 r = apply_protect_kernel_logs(unit, context);
3912                 if (r < 0) {
3913                         *exit_status = EXIT_SECCOMP;
3914                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3915                 }
3916
3917                 r = apply_protect_clock(unit, context);
3918                 if (r < 0) {
3919                         *exit_status = EXIT_SECCOMP;
3920                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3921                 }
3922
3923                 r = apply_private_devices(unit, context);
3924                 if (r < 0) {
3925                         *exit_status = EXIT_SECCOMP;
3926                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3927                 }
3928
3929                 r = apply_syscall_archs(unit, context);
3930                 if (r < 0) {
3931                         *exit_status = EXIT_SECCOMP;
3932                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3933                 }
3934
3935                 r = apply_lock_personality(unit, context);
3936                 if (r < 0) {
3937                         *exit_status = EXIT_SECCOMP;
3938                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3939                 }
3940
3941                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3942                  * by the filter as little as possible. */
3943                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3944                 if (r < 0) {
3945                         *exit_status = EXIT_SECCOMP;
3946                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3947                 }
3948 #endif
3949         }
3950
3951         if (!strv_isempty(context->unset_environment)) {
3952                 char **ee = NULL;
3953
3954                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3955                 if (!ee) {
3956                         *exit_status = EXIT_MEMORY;
3957                         return log_oom();
3958                 }
3959
3960                 strv_free_and_replace(accum_env, ee);
3961         }
3962
3963         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3964                 replaced_argv = replace_env_argv(command->argv, accum_env);
3965                 if (!replaced_argv) {
3966                         *exit_status = EXIT_MEMORY;
3967                         return log_oom();
3968                 }
3969                 final_argv = replaced_argv;
3970         } else
3971                 final_argv = command->argv;
3972
3973         if (DEBUG_LOGGING) {
3974                 _cleanup_free_ char *line;
3975
3976                 line = exec_command_line(final_argv);
3977                 if (line)
3978                         log_struct(LOG_DEBUG,
3979                                    "EXECUTABLE=%s", command->path,
3980                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3981                                    LOG_UNIT_ID(unit),
3982                                    LOG_UNIT_INVOCATION_ID(unit));
3983         }
3984
3985         if (exec_fd >= 0) {
3986                 uint8_t hot = 1;
3987
3988                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3989                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3990
3991                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3992                         *exit_status = EXIT_EXEC;
3993                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3994                 }
3995         }
3996
3997         execve(command->path, final_argv, accum_env);
3998         r = -errno;
3999
4000         if (exec_fd >= 0) {
4001                 uint8_t hot = 0;
4002
4003                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4004                  * that POLLHUP on it no longer means execve() succeeded. */
4005
4006                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4007                         *exit_status = EXIT_EXEC;
4008                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4009                 }
4010         }
4011
4012         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4013                 log_struct_errno(LOG_INFO, r,
4014                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4015                                  LOG_UNIT_ID(unit),
4016                                  LOG_UNIT_INVOCATION_ID(unit),
4017                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4018                                                   command->path),
4019                                  "EXECUTABLE=%s", command->path);
4020                 return 0;
4021         }
4022
4023         *exit_status = EXIT_EXEC;
4024         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4025 }
4026
4027 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4028 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4029
4030 int exec_spawn(Unit *unit,
4031                ExecCommand *command,
4032                const ExecContext *context,
4033                const ExecParameters *params,
4034                ExecRuntime *runtime,
4035                DynamicCreds *dcreds,
4036                pid_t *ret) {
4037
4038         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4039         _cleanup_free_ char *subcgroup_path = NULL;
4040         _cleanup_strv_free_ char **files_env = NULL;
4041         size_t n_storage_fds = 0, n_socket_fds = 0;
4042         _cleanup_free_ char *line = NULL;
4043         pid_t pid;
4044
4045         assert(unit);
4046         assert(command);
4047         assert(context);
4048         assert(ret);
4049         assert(params);
4050         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4051
4052         if (context->std_input == EXEC_INPUT_SOCKET ||
4053             context->std_output == EXEC_OUTPUT_SOCKET ||
4054             context->std_error == EXEC_OUTPUT_SOCKET) {
4055
4056                 if (params->n_socket_fds > 1) {
4057                         log_unit_error(unit, "Got more than one socket.");
4058                         return -EINVAL;
4059                 }
4060
4061                 if (params->n_socket_fds == 0) {
4062                         log_unit_error(unit, "Got no socket.");
4063                         return -EINVAL;
4064                 }
4065
4066                 socket_fd = params->fds[0];
4067         } else {
4068                 socket_fd = -1;
4069                 fds = params->fds;
4070                 n_socket_fds = params->n_socket_fds;
4071                 n_storage_fds = params->n_storage_fds;
4072         }
4073
4074         r = exec_context_named_iofds(context, params, named_iofds);
4075         if (r < 0)
4076                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4077
4078         r = exec_context_load_environment(unit, context, &files_env);
4079         if (r < 0)
4080                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4081
4082         line = exec_command_line(command->argv);
4083         if (!line)
4084                 return log_oom();
4085
4086         log_struct(LOG_DEBUG,
4087                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4088                    "EXECUTABLE=%s", command->path,
4089                    LOG_UNIT_ID(unit),
4090                    LOG_UNIT_INVOCATION_ID(unit));
4091
4092         if (params->cgroup_path) {
4093                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4094                 if (r < 0)
4095                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4096                 if (r > 0) { /* We are using a child cgroup */
4097                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4098                         if (r < 0)
4099                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4100                 }
4101         }
4102
4103         pid = fork();
4104         if (pid < 0)
4105                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4106
4107         if (pid == 0) {
4108                 int exit_status = EXIT_SUCCESS;
4109
4110                 r = exec_child(unit,
4111                                command,
4112                                context,
4113                                params,
4114                                runtime,
4115                                dcreds,
4116                                socket_fd,
4117                                named_iofds,
4118                                fds,
4119                                n_socket_fds,
4120                                n_storage_fds,
4121                                files_env,
4122                                unit->manager->user_lookup_fds[1],
4123                                &exit_status);
4124
4125                 if (r < 0) {
4126                         const char *status =
4127                                 exit_status_to_string(exit_status,
4128                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4129
4130                         log_struct_errno(LOG_ERR, r,
4131                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4132                                          LOG_UNIT_ID(unit),
4133                                          LOG_UNIT_INVOCATION_ID(unit),
4134                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4135                                                           status, command->path),
4136                                          "EXECUTABLE=%s", command->path);
4137                 }
4138
4139                 _exit(exit_status);
4140         }
4141
4142         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4143
4144         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4145          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4146          * process will be killed too). */
4147         if (subcgroup_path)
4148                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4149
4150         exec_status_start(&command->exec_status, pid);
4151
4152         *ret = pid;
4153         return 0;
4154 }
4155
4156 void exec_context_init(ExecContext *c) {
4157         ExecDirectoryType i;
4158
4159         assert(c);
4160
4161         c->umask = 0022;
4162         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4163         c->cpu_sched_policy = SCHED_OTHER;
4164         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4165         c->syslog_level_prefix = true;
4166         c->ignore_sigpipe = true;
4167         c->timer_slack_nsec = NSEC_INFINITY;
4168         c->personality = PERSONALITY_INVALID;
4169         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4170                 c->directories[i].mode = 0755;
4171         c->timeout_clean_usec = USEC_INFINITY;
4172         c->capability_bounding_set = CAP_ALL;
4173         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4174         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4175         c->log_level_max = -1;
4176         numa_policy_reset(&c->numa_policy);
4177 }
4178
4179 void exec_context_done(ExecContext *c) {
4180         ExecDirectoryType i;
4181         size_t l;
4182
4183         assert(c);
4184
4185         c->environment = strv_free(c->environment);
4186         c->environment_files = strv_free(c->environment_files);
4187         c->pass_environment = strv_free(c->pass_environment);
4188         c->unset_environment = strv_free(c->unset_environment);
4189
4190         rlimit_free_all(c->rlimit);
4191
4192         for (l = 0; l < 3; l++) {
4193                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4194                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4195         }
4196
4197         c->working_directory = mfree(c->working_directory);
4198         c->root_directory = mfree(c->root_directory);
4199         c->root_image = mfree(c->root_image);
4200         c->root_hash = mfree(c->root_hash);
4201         c->root_hash_size = 0;
4202         c->root_hash_path = mfree(c->root_hash_path);
4203         c->root_verity = mfree(c->root_verity);
4204         c->tty_path = mfree(c->tty_path);
4205         c->syslog_identifier = mfree(c->syslog_identifier);
4206         c->user = mfree(c->user);
4207         c->group = mfree(c->group);
4208
4209         c->supplementary_groups = strv_free(c->supplementary_groups);
4210
4211         c->pam_name = mfree(c->pam_name);
4212
4213         c->read_only_paths = strv_free(c->read_only_paths);
4214         c->read_write_paths = strv_free(c->read_write_paths);
4215         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4216
4217         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4218         c->bind_mounts = NULL;
4219         c->n_bind_mounts = 0;
4220         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4221         c->temporary_filesystems = NULL;
4222         c->n_temporary_filesystems = 0;
4223
4224         cpu_set_reset(&c->cpu_set);
4225         numa_policy_reset(&c->numa_policy);
4226
4227         c->utmp_id = mfree(c->utmp_id);
4228         c->selinux_context = mfree(c->selinux_context);
4229         c->apparmor_profile = mfree(c->apparmor_profile);
4230         c->smack_process_label = mfree(c->smack_process_label);
4231
4232         c->syscall_filter = hashmap_free(c->syscall_filter);
4233         c->syscall_archs = set_free(c->syscall_archs);
4234         c->address_families = set_free(c->address_families);
4235
4236         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4237                 c->directories[i].paths = strv_free(c->directories[i].paths);
4238
4239         c->log_level_max = -1;
4240
4241         exec_context_free_log_extra_fields(c);
4242
4243         c->log_ratelimit_interval_usec = 0;
4244         c->log_ratelimit_burst = 0;
4245
4246         c->stdin_data = mfree(c->stdin_data);
4247         c->stdin_data_size = 0;
4248
4249         c->network_namespace_path = mfree(c->network_namespace_path);
4250
4251         c->log_namespace = mfree(c->log_namespace);
4252 }
4253
4254 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4255         char **i;
4256
4257         assert(c);
4258
4259         if (!runtime_prefix)
4260                 return 0;
4261
4262         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4263                 _cleanup_free_ char *p;
4264
4265                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4266                         p = path_join(runtime_prefix, "private", *i);
4267                 else
4268                         p = path_join(runtime_prefix, *i);
4269                 if (!p)
4270                         return -ENOMEM;
4271
4272                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4273                  * service next. */
4274                 (void) rm_rf(p, REMOVE_ROOT);
4275         }
4276
4277         return 0;
4278 }
4279
4280 static void exec_command_done(ExecCommand *c) {
4281         assert(c);
4282
4283         c->path = mfree(c->path);
4284         c->argv = strv_free(c->argv);
4285 }
4286
4287 void exec_command_done_array(ExecCommand *c, size_t n) {
4288         size_t i;
4289
4290         for (i = 0; i < n; i++)
4291                 exec_command_done(c+i);
4292 }
4293
4294 ExecCommand* exec_command_free_list(ExecCommand *c) {
4295         ExecCommand *i;
4296
4297         while ((i = c)) {
4298                 LIST_REMOVE(command, c, i);
4299                 exec_command_done(i);
4300                 free(i);
4301         }
4302
4303         return NULL;
4304 }
4305
4306 void exec_command_free_array(ExecCommand **c, size_t n) {
4307         size_t i;
4308
4309         for (i = 0; i < n; i++)
4310                 c[i] = exec_command_free_list(c[i]);
4311 }
4312
4313 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4314         size_t i;
4315
4316         for (i = 0; i < n; i++)
4317                 exec_status_reset(&c[i].exec_status);
4318 }
4319
4320 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4321         size_t i;
4322
4323         for (i = 0; i < n; i++) {
4324                 ExecCommand *z;
4325
4326                 LIST_FOREACH(command, z, c[i])
4327                         exec_status_reset(&z->exec_status);
4328         }
4329 }
4330
4331 typedef struct InvalidEnvInfo {
4332         const Unit *unit;
4333         const char *path;
4334 } InvalidEnvInfo;
4335
4336 static void invalid_env(const char *p, void *userdata) {
4337         InvalidEnvInfo *info = userdata;
4338
4339         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4340 }
4341
4342 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4343         assert(c);
4344
4345         switch (fd_index) {
4346
4347         case STDIN_FILENO:
4348                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4349                         return NULL;
4350
4351                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4352
4353         case STDOUT_FILENO:
4354                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4355                         return NULL;
4356
4357                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4358
4359         case STDERR_FILENO:
4360                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4361                         return NULL;
4362
4363                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4364
4365         default:
4366                 return NULL;
4367         }
4368 }
4369
4370 static int exec_context_named_iofds(
4371                 const ExecContext *c,
4372                 const ExecParameters *p,
4373                 int named_iofds[static 3]) {
4374
4375         size_t i, targets;
4376         const char* stdio_fdname[3];
4377         size_t n_fds;
4378
4379         assert(c);
4380         assert(p);
4381         assert(named_iofds);
4382
4383         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4384                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4385                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4386
4387         for (i = 0; i < 3; i++)
4388                 stdio_fdname[i] = exec_context_fdname(c, i);
4389
4390         n_fds = p->n_storage_fds + p->n_socket_fds;
4391
4392         for (i = 0; i < n_fds  && targets > 0; i++)
4393                 if (named_iofds[STDIN_FILENO] < 0 &&
4394                     c->std_input == EXEC_INPUT_NAMED_FD &&
4395                     stdio_fdname[STDIN_FILENO] &&
4396                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4397
4398                         named_iofds[STDIN_FILENO] = p->fds[i];
4399                         targets--;
4400
4401                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4402                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4403                            stdio_fdname[STDOUT_FILENO] &&
4404                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4405
4406                         named_iofds[STDOUT_FILENO] = p->fds[i];
4407                         targets--;
4408
4409                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4410                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4411                            stdio_fdname[STDERR_FILENO] &&
4412                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4413
4414                         named_iofds[STDERR_FILENO] = p->fds[i];
4415                         targets--;
4416                 }
4417
4418         return targets == 0 ? 0 : -ENOENT;
4419 }
4420
4421 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4422         char **i, **r = NULL;
4423
4424         assert(c);
4425         assert(l);
4426
4427         STRV_FOREACH(i, c->environment_files) {
4428                 char *fn;
4429                 int k;
4430                 unsigned n;
4431                 bool ignore = false;
4432                 char **p;
4433                 _cleanup_globfree_ glob_t pglob = {};
4434
4435                 fn = *i;
4436
4437                 if (fn[0] == '-') {
4438                         ignore = true;
4439                         fn++;
4440                 }
4441
4442                 if (!path_is_absolute(fn)) {
4443                         if (ignore)
4444                                 continue;
4445
4446                         strv_free(r);
4447                         return -EINVAL;
4448                 }
4449
4450                 /* Filename supports globbing, take all matching files */
4451                 k = safe_glob(fn, 0, &pglob);
4452                 if (k < 0) {
4453                         if (ignore)
4454                                 continue;
4455
4456                         strv_free(r);
4457                         return k;
4458                 }
4459
4460                 /* When we don't match anything, -ENOENT should be returned */
4461                 assert(pglob.gl_pathc > 0);
4462
4463                 for (n = 0; n < pglob.gl_pathc; n++) {
4464                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4465                         if (k < 0) {
4466                                 if (ignore)
4467                                         continue;
4468
4469                                 strv_free(r);
4470                                 return k;
4471                         }
4472                         /* Log invalid environment variables with filename */
4473                         if (p) {
4474                                 InvalidEnvInfo info = {
4475                                         .unit = unit,
4476                                         .path = pglob.gl_pathv[n]
4477                                 };
4478
4479                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4480                         }
4481
4482                         if (!r)
4483                                 r = p;
4484                         else {
4485                                 char **m;
4486
4487                                 m = strv_env_merge(2, r, p);
4488                                 strv_free(r);
4489                                 strv_free(p);
4490                                 if (!m)
4491                                         return -ENOMEM;
4492
4493                                 r = m;
4494                         }
4495                 }
4496         }
4497
4498         *l = r;
4499
4500         return 0;
4501 }
4502
4503 static bool tty_may_match_dev_console(const char *tty) {
4504         _cleanup_free_ char *resolved = NULL;
4505
4506         if (!tty)
4507                 return true;
4508
4509         tty = skip_dev_prefix(tty);
4510
4511         /* trivial identity? */
4512         if (streq(tty, "console"))
4513                 return true;
4514
4515         if (resolve_dev_console(&resolved) < 0)
4516                 return true; /* if we could not resolve, assume it may */
4517
4518         /* "tty0" means the active VC, so it may be the same sometimes */
4519         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4520 }
4521
4522 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4523         assert(ec);
4524
4525         return ec->tty_reset ||
4526                 ec->tty_vhangup ||
4527                 ec->tty_vt_disallocate ||
4528                 is_terminal_input(ec->std_input) ||
4529                 is_terminal_output(ec->std_output) ||
4530                 is_terminal_output(ec->std_error);
4531 }
4532
4533 bool exec_context_may_touch_console(const ExecContext *ec) {
4534
4535         return exec_context_may_touch_tty(ec) &&
4536                tty_may_match_dev_console(exec_context_tty_path(ec));
4537 }
4538
/* Writes every string of the string vector 'l' to 'f', each one prefixed with a single space. Writes nothing
 * if 'l' is NULL or empty. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
4547
4548 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4549         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4550         ExecDirectoryType dt;
4551         unsigned i;
4552         int r;
4553
4554         assert(c);
4555         assert(f);
4556
4557         prefix = strempty(prefix);
4558
4559         fprintf(f,
4560                 "%sUMask: %04o\n"
4561                 "%sWorkingDirectory: %s\n"
4562                 "%sRootDirectory: %s\n"
4563                 "%sNonBlocking: %s\n"
4564                 "%sPrivateTmp: %s\n"
4565                 "%sPrivateDevices: %s\n"
4566                 "%sProtectKernelTunables: %s\n"
4567                 "%sProtectKernelModules: %s\n"
4568                 "%sProtectKernelLogs: %s\n"
4569                 "%sProtectClock: %s\n"
4570                 "%sProtectControlGroups: %s\n"
4571                 "%sPrivateNetwork: %s\n"
4572                 "%sPrivateUsers: %s\n"
4573                 "%sProtectHome: %s\n"
4574                 "%sProtectSystem: %s\n"
4575                 "%sMountAPIVFS: %s\n"
4576                 "%sIgnoreSIGPIPE: %s\n"
4577                 "%sMemoryDenyWriteExecute: %s\n"
4578                 "%sRestrictRealtime: %s\n"
4579                 "%sRestrictSUIDSGID: %s\n"
4580                 "%sKeyringMode: %s\n"
4581                 "%sProtectHostname: %s\n",
4582                 prefix, c->umask,
4583                 prefix, c->working_directory ? c->working_directory : "/",
4584                 prefix, c->root_directory ? c->root_directory : "/",
4585                 prefix, yes_no(c->non_blocking),
4586                 prefix, yes_no(c->private_tmp),
4587                 prefix, yes_no(c->private_devices),
4588                 prefix, yes_no(c->protect_kernel_tunables),
4589                 prefix, yes_no(c->protect_kernel_modules),
4590                 prefix, yes_no(c->protect_kernel_logs),
4591                 prefix, yes_no(c->protect_clock),
4592                 prefix, yes_no(c->protect_control_groups),
4593                 prefix, yes_no(c->private_network),
4594                 prefix, yes_no(c->private_users),
4595                 prefix, protect_home_to_string(c->protect_home),
4596                 prefix, protect_system_to_string(c->protect_system),
4597                 prefix, yes_no(c->mount_apivfs),
4598                 prefix, yes_no(c->ignore_sigpipe),
4599                 prefix, yes_no(c->memory_deny_write_execute),
4600                 prefix, yes_no(c->restrict_realtime),
4601                 prefix, yes_no(c->restrict_suid_sgid),
4602                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4603                 prefix, yes_no(c->protect_hostname));
4604
4605         if (c->root_image)
4606                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4607
4608         if (c->root_hash) {
4609                 _cleanup_free_ char *encoded = NULL;
4610                 encoded = hexmem(c->root_hash, c->root_hash_size);
4611                 if (encoded)
4612                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
4613         }
4614
4615         if (c->root_hash_path)
4616                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
4617
4618         if (c->root_verity)
4619                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
4620
4621         STRV_FOREACH(e, c->environment)
4622                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4623
4624         STRV_FOREACH(e, c->environment_files)
4625                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4626
4627         STRV_FOREACH(e, c->pass_environment)
4628                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4629
4630         STRV_FOREACH(e, c->unset_environment)
4631                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4632
4633         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4634
4635         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4636                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4637
4638                 STRV_FOREACH(d, c->directories[dt].paths)
4639                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4640         }
4641
4642         fprintf(f,
4643                 "%sTimeoutCleanSec: %s\n",
4644                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4645
4646         if (c->nice_set)
4647                 fprintf(f,
4648                         "%sNice: %i\n",
4649                         prefix, c->nice);
4650
4651         if (c->oom_score_adjust_set)
4652                 fprintf(f,
4653                         "%sOOMScoreAdjust: %i\n",
4654                         prefix, c->oom_score_adjust);
4655
4656         if (c->coredump_filter_set)
4657                 fprintf(f,
4658                         "%sCoredumpFilter: 0x%"PRIx64"\n",
4659                         prefix, c->coredump_filter);
4660
4661         for (i = 0; i < RLIM_NLIMITS; i++)
4662                 if (c->rlimit[i]) {
4663                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4664                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4665                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4666                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4667                 }
4668
4669         if (c->ioprio_set) {
4670                 _cleanup_free_ char *class_str = NULL;
4671
4672                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4673                 if (r >= 0)
4674                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4675
4676                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4677         }
4678
4679         if (c->cpu_sched_set) {
4680                 _cleanup_free_ char *policy_str = NULL;
4681
4682                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4683                 if (r >= 0)
4684                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4685
4686                 fprintf(f,
4687                         "%sCPUSchedulingPriority: %i\n"
4688                         "%sCPUSchedulingResetOnFork: %s\n",
4689                         prefix, c->cpu_sched_priority,
4690                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4691         }
4692
4693         if (c->cpu_set.set) {
4694                 _cleanup_free_ char *affinity = NULL;
4695
4696                 affinity = cpu_set_to_range_string(&c->cpu_set);
4697                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4698         }
4699
4700         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4701                 _cleanup_free_ char *nodes = NULL;
4702
4703                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4704                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4705                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4706         }
4707
4708         if (c->timer_slack_nsec != NSEC_INFINITY)
4709                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4710
4711         fprintf(f,
4712                 "%sStandardInput: %s\n"
4713                 "%sStandardOutput: %s\n"
4714                 "%sStandardError: %s\n",
4715                 prefix, exec_input_to_string(c->std_input),
4716                 prefix, exec_output_to_string(c->std_output),
4717                 prefix, exec_output_to_string(c->std_error));
4718
4719         if (c->std_input == EXEC_INPUT_NAMED_FD)
4720                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4721         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4722                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4723         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4724                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4725
4726         if (c->std_input == EXEC_INPUT_FILE)
4727                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4728         if (c->std_output == EXEC_OUTPUT_FILE)
4729                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4730         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4731                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4732         if (c->std_error == EXEC_OUTPUT_FILE)
4733                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4734         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4735                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4736
4737         if (c->tty_path)
4738                 fprintf(f,
4739                         "%sTTYPath: %s\n"
4740                         "%sTTYReset: %s\n"
4741                         "%sTTYVHangup: %s\n"
4742                         "%sTTYVTDisallocate: %s\n",
4743                         prefix, c->tty_path,
4744                         prefix, yes_no(c->tty_reset),
4745                         prefix, yes_no(c->tty_vhangup),
4746                         prefix, yes_no(c->tty_vt_disallocate));
4747
4748         if (IN_SET(c->std_output,
4749                    EXEC_OUTPUT_KMSG,
4750                    EXEC_OUTPUT_JOURNAL,
4751                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4752                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4753             IN_SET(c->std_error,
4754                    EXEC_OUTPUT_KMSG,
4755                    EXEC_OUTPUT_JOURNAL,
4756                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4757                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4758
4759                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4760
4761                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4762                 if (r >= 0)
4763                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4764
4765                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4766                 if (r >= 0)
4767                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4768         }
4769
4770         if (c->log_level_max >= 0) {
4771                 _cleanup_free_ char *t = NULL;
4772
4773                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4774
4775                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4776         }
4777
4778         if (c->log_ratelimit_interval_usec > 0) {
4779                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4780
4781                 fprintf(f,
4782                         "%sLogRateLimitIntervalSec: %s\n",
4783                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4784         }
4785
4786         if (c->log_ratelimit_burst > 0)
4787                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4788
4789         if (c->n_log_extra_fields > 0) {
4790                 size_t j;
4791
4792                 for (j = 0; j < c->n_log_extra_fields; j++) {
4793                         fprintf(f, "%sLogExtraFields: ", prefix);
4794                         fwrite(c->log_extra_fields[j].iov_base,
4795                                1, c->log_extra_fields[j].iov_len,
4796                                f);
4797                         fputc('\n', f);
4798                 }
4799         }
4800
4801         if (c->log_namespace)
4802                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4803
4804         if (c->secure_bits) {
4805                 _cleanup_free_ char *str = NULL;
4806
4807                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4808                 if (r >= 0)
4809                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4810         }
4811
4812         if (c->capability_bounding_set != CAP_ALL) {
4813                 _cleanup_free_ char *str = NULL;
4814
4815                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4816                 if (r >= 0)
4817                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4818         }
4819
4820         if (c->capability_ambient_set != 0) {
4821                 _cleanup_free_ char *str = NULL;
4822
4823                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4824                 if (r >= 0)
4825                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4826         }
4827
4828         if (c->user)
4829                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4830         if (c->group)
4831                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4832
4833         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4834
4835         if (!strv_isempty(c->supplementary_groups)) {
4836                 fprintf(f, "%sSupplementaryGroups:", prefix);
4837                 strv_fprintf(f, c->supplementary_groups);
4838                 fputs("\n", f);
4839         }
4840
4841         if (c->pam_name)
4842                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4843
4844         if (!strv_isempty(c->read_write_paths)) {
4845                 fprintf(f, "%sReadWritePaths:", prefix);
4846                 strv_fprintf(f, c->read_write_paths);
4847                 fputs("\n", f);
4848         }
4849
4850         if (!strv_isempty(c->read_only_paths)) {
4851                 fprintf(f, "%sReadOnlyPaths:", prefix);
4852                 strv_fprintf(f, c->read_only_paths);
4853                 fputs("\n", f);
4854         }
4855
4856         if (!strv_isempty(c->inaccessible_paths)) {
4857                 fprintf(f, "%sInaccessiblePaths:", prefix);
4858                 strv_fprintf(f, c->inaccessible_paths);
4859                 fputs("\n", f);
4860         }
4861
4862         if (c->n_bind_mounts > 0)
4863                 for (i = 0; i < c->n_bind_mounts; i++)
4864                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4865                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4866                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4867                                 c->bind_mounts[i].source,
4868                                 c->bind_mounts[i].destination,
4869                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4870
4871         if (c->n_temporary_filesystems > 0)
4872                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4873                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4874
4875                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4876                                 t->path,
4877                                 isempty(t->options) ? "" : ":",
4878                                 strempty(t->options));
4879                 }
4880
4881         if (c->utmp_id)
4882                 fprintf(f,
4883                         "%sUtmpIdentifier: %s\n",
4884                         prefix, c->utmp_id);
4885
4886         if (c->selinux_context)
4887                 fprintf(f,
4888                         "%sSELinuxContext: %s%s\n",
4889                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4890
4891         if (c->apparmor_profile)
4892                 fprintf(f,
4893                         "%sAppArmorProfile: %s%s\n",
4894                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4895
4896         if (c->smack_process_label)
4897                 fprintf(f,
4898                         "%sSmackProcessLabel: %s%s\n",
4899                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4900
4901         if (c->personality != PERSONALITY_INVALID)
4902                 fprintf(f,
4903                         "%sPersonality: %s\n",
4904                         prefix, strna(personality_to_string(c->personality)));
4905
4906         fprintf(f,
4907                 "%sLockPersonality: %s\n",
4908                 prefix, yes_no(c->lock_personality));
4909
4910         if (c->syscall_filter) {
4911 #if HAVE_SECCOMP
4912                 Iterator j;
4913                 void *id, *val;
4914                 bool first = true;
4915 #endif
4916
4917                 fprintf(f,
4918                         "%sSystemCallFilter: ",
4919                         prefix);
4920
4921                 if (!c->syscall_allow_list)
4922                         fputc('~', f);
4923
4924 #if HAVE_SECCOMP
4925                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4926                         _cleanup_free_ char *name = NULL;
4927                         const char *errno_name = NULL;
4928                         int num = PTR_TO_INT(val);
4929
4930                         if (first)
4931                                 first = false;
4932                         else
4933                                 fputc(' ', f);
4934
4935                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4936                         fputs(strna(name), f);
4937
4938                         if (num >= 0) {
4939                                 errno_name = errno_to_name(num);
4940                                 if (errno_name)
4941                                         fprintf(f, ":%s", errno_name);
4942                                 else
4943                                         fprintf(f, ":%d", num);
4944                         }
4945                 }
4946 #endif
4947
4948                 fputc('\n', f);
4949         }
4950
4951         if (c->syscall_archs) {
4952 #if HAVE_SECCOMP
4953                 Iterator j;
4954                 void *id;
4955 #endif
4956
4957                 fprintf(f,
4958                         "%sSystemCallArchitectures:",
4959                         prefix);
4960
4961 #if HAVE_SECCOMP
4962                 SET_FOREACH(id, c->syscall_archs, j)
4963                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4964 #endif
4965                 fputc('\n', f);
4966         }
4967
4968         if (exec_context_restrict_namespaces_set(c)) {
4969                 _cleanup_free_ char *s = NULL;
4970
4971                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4972                 if (r >= 0)
4973                         fprintf(f, "%sRestrictNamespaces: %s\n",
4974                                 prefix, strna(s));
4975         }
4976
4977         if (c->network_namespace_path)
4978                 fprintf(f,
4979                         "%sNetworkNamespacePath: %s\n",
4980                         prefix, c->network_namespace_path);
4981
4982         if (c->syscall_errno > 0) {
4983                 const char *errno_name;
4984
4985                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4986
4987                 errno_name = errno_to_name(c->syscall_errno);
4988                 if (errno_name)
4989                         fprintf(f, "%s\n", errno_name);
4990                 else
4991                         fprintf(f, "%d\n", c->syscall_errno);
4992         }
4993 }
4994
4995 bool exec_context_maintains_privileges(const ExecContext *c) {
4996         assert(c);
4997
4998         /* Returns true if the process forked off would run under
4999          * an unchanged UID or as root. */
5000
5001         if (!c->user)
5002                 return true;
5003
5004         if (streq(c->user, "root") || streq(c->user, "0"))
5005                 return true;
5006
5007         return false;
5008 }
5009
5010 int exec_context_get_effective_ioprio(const ExecContext *c) {
5011         int p;
5012
5013         assert(c);
5014
5015         if (c->ioprio_set)
5016                 return c->ioprio;
5017
5018         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5019         if (p < 0)
5020                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5021
5022         return p;
5023 }
5024
5025 void exec_context_free_log_extra_fields(ExecContext *c) {
5026         size_t l;
5027
5028         assert(c);
5029
5030         for (l = 0; l < c->n_log_extra_fields; l++)
5031                 free(c->log_extra_fields[l].iov_base);
5032         c->log_extra_fields = mfree(c->log_extra_fields);
5033         c->n_log_extra_fields = 0;
5034 }
5035
5036 void exec_context_revert_tty(ExecContext *c) {
5037         int r;
5038
5039         assert(c);
5040
5041         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5042         exec_context_tty_reset(c, NULL);
5043
5044         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5045          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5046          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5047
5048         if (exec_context_may_touch_tty(c)) {
5049                 const char *path;
5050
5051                 path = exec_context_tty_path(c);
5052                 if (path) {
5053                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5054                         if (r < 0 && r != -ENOENT)
5055                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5056                 }
5057         }
5058 }
5059
5060 int exec_context_get_clean_directories(
5061                 ExecContext *c,
5062                 char **prefix,
5063                 ExecCleanMask mask,
5064                 char ***ret) {
5065
5066         _cleanup_strv_free_ char **l = NULL;
5067         ExecDirectoryType t;
5068         int r;
5069
5070         assert(c);
5071         assert(prefix);
5072         assert(ret);
5073
5074         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5075                 char **i;
5076
5077                 if (!FLAGS_SET(mask, 1U << t))
5078                         continue;
5079
5080                 if (!prefix[t])
5081                         continue;
5082
5083                 STRV_FOREACH(i, c->directories[t].paths) {
5084                         char *j;
5085
5086                         j = path_join(prefix[t], *i);
5087                         if (!j)
5088                                 return -ENOMEM;
5089
5090                         r = strv_consume(&l, j);
5091                         if (r < 0)
5092                                 return r;
5093
5094                         /* Also remove private directories unconditionally. */
5095                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5096                                 j = path_join(prefix[t], "private", *i);
5097                                 if (!j)
5098                                         return -ENOMEM;
5099
5100                                 r = strv_consume(&l, j);
5101                                 if (r < 0)
5102                                         return r;
5103                         }
5104                 }
5105         }
5106
5107         *ret = TAKE_PTR(l);
5108         return 0;
5109 }
5110
5111 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5112         ExecCleanMask mask = 0;
5113
5114         assert(c);
5115         assert(ret);
5116
5117         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5118                 if (!strv_isempty(c->directories[t].paths))
5119                         mask |= 1U << t;
5120
5121         *ret = mask;
5122         return 0;
5123 }
5124
5125 void exec_status_start(ExecStatus *s, pid_t pid) {
5126         assert(s);
5127
5128         *s = (ExecStatus) {
5129                 .pid = pid,
5130         };
5131
5132         dual_timestamp_get(&s->start_timestamp);
5133 }
5134
5135 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5136         assert(s);
5137
5138         if (s->pid != pid) {
5139                 *s = (ExecStatus) {
5140                         .pid = pid,
5141                 };
5142         }
5143
5144         dual_timestamp_get(&s->exit_timestamp);
5145
5146         s->code = code;
5147         s->status = status;
5148
5149         if (context && context->utmp_id)
5150                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5151 }
5152
5153 void exec_status_reset(ExecStatus *s) {
5154         assert(s);
5155
5156         *s = (ExecStatus) {};
5157 }
5158
5159 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5160         char buf[FORMAT_TIMESTAMP_MAX];
5161
5162         assert(s);
5163         assert(f);
5164
5165         if (s->pid <= 0)
5166                 return;
5167
5168         prefix = strempty(prefix);
5169
5170         fprintf(f,
5171                 "%sPID: "PID_FMT"\n",
5172                 prefix, s->pid);
5173
5174         if (dual_timestamp_is_set(&s->start_timestamp))
5175                 fprintf(f,
5176                         "%sStart Timestamp: %s\n",
5177                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5178
5179         if (dual_timestamp_is_set(&s->exit_timestamp))
5180                 fprintf(f,
5181                         "%sExit Timestamp: %s\n"
5182                         "%sExit Code: %s\n"
5183                         "%sExit Status: %i\n",
5184                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5185                         prefix, sigchld_code_to_string(s->code),
5186                         prefix, s->status);
5187 }
5188
5189 static char *exec_command_line(char **argv) {
5190         size_t k;
5191         char *n, *p, **a;
5192         bool first = true;
5193
5194         assert(argv);
5195
5196         k = 1;
5197         STRV_FOREACH(a, argv)
5198                 k += strlen(*a)+3;
5199
5200         n = new(char, k);
5201         if (!n)
5202                 return NULL;
5203
5204         p = n;
5205         STRV_FOREACH(a, argv) {
5206
5207                 if (!first)
5208                         *(p++) = ' ';
5209                 else
5210                         first = false;
5211
5212                 if (strpbrk(*a, WHITESPACE)) {
5213                         *(p++) = '\'';
5214                         p = stpcpy(p, *a);
5215                         *(p++) = '\'';
5216                 } else
5217                         p = stpcpy(p, *a);
5218
5219         }
5220
5221         *p = 0;
5222
5223         /* FIXME: this doesn't really handle arguments that have
5224          * spaces and ticks in them */
5225
5226         return n;
5227 }
5228
5229 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5230         _cleanup_free_ char *cmd = NULL;
5231         const char *prefix2;
5232
5233         assert(c);
5234         assert(f);
5235
5236         prefix = strempty(prefix);
5237         prefix2 = strjoina(prefix, "\t");
5238
5239         cmd = exec_command_line(c->argv);
5240         fprintf(f,
5241                 "%sCommand Line: %s\n",
5242                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5243
5244         exec_status_dump(&c->exec_status, f, prefix2);
5245 }
5246
5247 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5248         assert(f);
5249
5250         prefix = strempty(prefix);
5251
5252         LIST_FOREACH(command, c, c)
5253                 exec_command_dump(c, f, prefix);
5254 }
5255
5256 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5257         ExecCommand *end;
5258
5259         assert(l);
5260         assert(e);
5261
5262         if (*l) {
5263                 /* It's kind of important, that we keep the order here */
5264                 LIST_FIND_TAIL(command, *l, end);
5265                 LIST_INSERT_AFTER(command, *l, end, e);
5266         } else
5267               *l = e;
5268 }
5269
5270 int exec_command_set(ExecCommand *c, const char *path, ...) {
5271         va_list ap;
5272         char **l, *p;
5273
5274         assert(c);
5275         assert(path);
5276
5277         va_start(ap, path);
5278         l = strv_new_ap(path, ap);
5279         va_end(ap);
5280
5281         if (!l)
5282                 return -ENOMEM;
5283
5284         p = strdup(path);
5285         if (!p) {
5286                 strv_free(l);
5287                 return -ENOMEM;
5288         }
5289
5290         free_and_replace(c->path, p);
5291
5292         return strv_free_and_replace(c->argv, l);
5293 }
5294
5295 int exec_command_append(ExecCommand *c, const char *path, ...) {
5296         _cleanup_strv_free_ char **l = NULL;
5297         va_list ap;
5298         int r;
5299
5300         assert(c);
5301         assert(path);
5302
5303         va_start(ap, path);
5304         l = strv_new_ap(path, ap);
5305         va_end(ap);
5306
5307         if (!l)
5308                 return -ENOMEM;
5309
5310         r = strv_extend_strv(&c->argv, l, false);
5311         if (r < 0)
5312                 return r;
5313
5314         return 0;
5315 }
5316
5317 static void *remove_tmpdir_thread(void *p) {
5318         _cleanup_free_ char *path = p;
5319
5320         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5321         return NULL;
5322 }
5323
5324 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5325         int r;
5326
5327         if (!rt)
5328                 return NULL;
5329
5330         if (rt->manager)
5331                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5332
5333         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5334         if (destroy && rt->tmp_dir) {
5335                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5336
5337                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5338                 if (r < 0) {
5339                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5340                         free(rt->tmp_dir);
5341                 }
5342
5343                 rt->tmp_dir = NULL;
5344         }
5345
5346         if (destroy && rt->var_tmp_dir) {
5347                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5348
5349                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5350                 if (r < 0) {
5351                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5352                         free(rt->var_tmp_dir);
5353                 }
5354
5355                 rt->var_tmp_dir = NULL;
5356         }
5357
5358         rt->id = mfree(rt->id);
5359         rt->tmp_dir = mfree(rt->tmp_dir);
5360         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5361         safe_close_pair(rt->netns_storage_socket);
5362         return mfree(rt);
5363 }
5364
5365 static void exec_runtime_freep(ExecRuntime **rt) {
5366         (void) exec_runtime_free(*rt, false);
5367 }
5368
5369 static int exec_runtime_allocate(ExecRuntime **ret) {
5370         ExecRuntime *n;
5371
5372         assert(ret);
5373
5374         n = new(ExecRuntime, 1);
5375         if (!n)
5376                 return -ENOMEM;
5377
5378         *n = (ExecRuntime) {
5379                 .netns_storage_socket = { -1, -1 },
5380         };
5381
5382         *ret = n;
5383         return 0;
5384 }
5385
5386 static int exec_runtime_add(
5387                 Manager *m,
5388                 const char *id,
5389                 const char *tmp_dir,
5390                 const char *var_tmp_dir,
5391                 const int netns_storage_socket[2],
5392                 ExecRuntime **ret) {
5393
5394         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5395         int r;
5396
5397         assert(m);
5398         assert(id);
5399
5400         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5401         if (r < 0)
5402                 return r;
5403
5404         r = exec_runtime_allocate(&rt);
5405         if (r < 0)
5406                 return r;
5407
5408         rt->id = strdup(id);
5409         if (!rt->id)
5410                 return -ENOMEM;
5411
5412         if (tmp_dir) {
5413                 rt->tmp_dir = strdup(tmp_dir);
5414                 if (!rt->tmp_dir)
5415                         return -ENOMEM;
5416
5417                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5418                 assert(var_tmp_dir);
5419                 rt->var_tmp_dir = strdup(var_tmp_dir);
5420                 if (!rt->var_tmp_dir)
5421                         return -ENOMEM;
5422         }
5423
5424         if (netns_storage_socket) {
5425                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5426                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5427         }
5428
5429         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5430         if (r < 0)
5431                 return r;
5432
5433         rt->manager = m;
5434
5435         if (ret)
5436                 *ret = rt;
5437
5438         /* do not remove created ExecRuntime object when the operation succeeds. */
5439         rt = NULL;
5440         return 0;
5441 }
5442
5443 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5444         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5445         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5446         int r;
5447
5448         assert(m);
5449         assert(c);
5450         assert(id);
5451
5452         /* It is not necessary to create ExecRuntime object. */
5453         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5454                 return 0;
5455
5456         if (c->private_tmp &&
5457             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5458               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5459                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5460                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5461                 if (r < 0)
5462                         return r;
5463         }
5464
5465         if (c->private_network || c->network_namespace_path) {
5466                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5467                         return -errno;
5468         }
5469
5470         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5471         if (r < 0)
5472                 return r;
5473
5474         /* Avoid cleanup */
5475         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5476         return 1;
5477 }
5478
5479 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5480         ExecRuntime *rt;
5481         int r;
5482
5483         assert(m);
5484         assert(id);
5485         assert(ret);
5486
5487         rt = hashmap_get(m->exec_runtime_by_id, id);
5488         if (rt)
5489                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5490                 goto ref;
5491
5492         if (!create)
5493                 return 0;
5494
5495         /* If not found, then create a new object. */
5496         r = exec_runtime_make(m, c, id, &rt);
5497         if (r <= 0)
5498                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5499                 return r;
5500
5501 ref:
5502         /* increment reference counter. */
5503         rt->n_ref++;
5504         *ret = rt;
5505         return 1;
5506 }
5507
5508 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5509         if (!rt)
5510                 return NULL;
5511
5512         assert(rt->n_ref > 0);
5513
5514         rt->n_ref--;
5515         if (rt->n_ref > 0)
5516                 return NULL;
5517
5518         return exec_runtime_free(rt, destroy);
5519 }
5520
5521 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5522         ExecRuntime *rt;
5523         Iterator i;
5524
5525         assert(m);
5526         assert(f);
5527         assert(fds);
5528
5529         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5530                 fprintf(f, "exec-runtime=%s", rt->id);
5531
5532                 if (rt->tmp_dir)
5533                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5534
5535                 if (rt->var_tmp_dir)
5536                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5537
5538                 if (rt->netns_storage_socket[0] >= 0) {
5539                         int copy;
5540
5541                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5542                         if (copy < 0)
5543                                 return copy;
5544
5545                         fprintf(f, " netns-socket-0=%i", copy);
5546                 }
5547
5548                 if (rt->netns_storage_socket[1] >= 0) {
5549                         int copy;
5550
5551                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5552                         if (copy < 0)
5553                                 return copy;
5554
5555                         fprintf(f, " netns-socket-1=%i", copy);
5556                 }
5557
5558                 fputc('\n', f);
5559         }
5560
5561         return 0;
5562 }
5563
5564 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5565         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5566         ExecRuntime *rt;
5567         int r;
5568
5569         /* This is for the migration from old (v237 or earlier) deserialization text.
5570          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5571          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5572          * so or not from the serialized text, then we always creates a new object owned by this. */
5573
5574         assert(u);
5575         assert(key);
5576         assert(value);
5577
5578         /* Manager manages ExecRuntime objects by the unit id.
5579          * So, we omit the serialized text when the unit does not have id (yet?)... */
5580         if (isempty(u->id)) {
5581                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5582                 return 0;
5583         }
5584
5585         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5586         if (r < 0) {
5587                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5588                 return 0;
5589         }
5590
5591         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5592         if (!rt) {
5593                 r = exec_runtime_allocate(&rt_create);
5594                 if (r < 0)
5595                         return log_oom();
5596
5597                 rt_create->id = strdup(u->id);
5598                 if (!rt_create->id)
5599                         return log_oom();
5600
5601                 rt = rt_create;
5602         }
5603
5604         if (streq(key, "tmp-dir")) {
5605                 char *copy;
5606
5607                 copy = strdup(value);
5608                 if (!copy)
5609                         return log_oom();
5610
5611                 free_and_replace(rt->tmp_dir, copy);
5612
5613         } else if (streq(key, "var-tmp-dir")) {
5614                 char *copy;
5615
5616                 copy = strdup(value);
5617                 if (!copy)
5618                         return log_oom();
5619
5620                 free_and_replace(rt->var_tmp_dir, copy);
5621
5622         } else if (streq(key, "netns-socket-0")) {
5623                 int fd;
5624
5625                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5626                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5627                         return 0;
5628                 }
5629
5630                 safe_close(rt->netns_storage_socket[0]);
5631                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5632
5633         } else if (streq(key, "netns-socket-1")) {
5634                 int fd;
5635
5636                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5637                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5638                         return 0;
5639                 }
5640
5641                 safe_close(rt->netns_storage_socket[1]);
5642                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5643         } else
5644                 return 0;
5645
5646         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5647         if (rt_create) {
5648                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5649                 if (r < 0) {
5650                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5651                         return 0;
5652                 }
5653
5654                 rt_create->manager = u->manager;
5655
5656                 /* Avoid cleanup */
5657                 rt_create = NULL;
5658         }
5659
5660         return 1;
5661 }
5662
5663 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5664         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5665         int r, fd0 = -1, fd1 = -1;
5666         const char *p, *v = value;
5667         size_t n;
5668
5669         assert(m);
5670         assert(value);
5671         assert(fds);
5672
5673         n = strcspn(v, " ");
5674         id = strndupa(v, n);
5675         if (v[n] != ' ')
5676                 goto finalize;
5677         p = v + n + 1;
5678
5679         v = startswith(p, "tmp-dir=");
5680         if (v) {
5681                 n = strcspn(v, " ");
5682                 tmp_dir = strndupa(v, n);
5683                 if (v[n] != ' ')
5684                         goto finalize;
5685                 p = v + n + 1;
5686         }
5687
5688         v = startswith(p, "var-tmp-dir=");
5689         if (v) {
5690                 n = strcspn(v, " ");
5691                 var_tmp_dir = strndupa(v, n);
5692                 if (v[n] != ' ')
5693                         goto finalize;
5694                 p = v + n + 1;
5695         }
5696
5697         v = startswith(p, "netns-socket-0=");
5698         if (v) {
5699                 char *buf;
5700
5701                 n = strcspn(v, " ");
5702                 buf = strndupa(v, n);
5703                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5704                         log_debug("Unable to process exec-runtime netns fd specification.");
5705                         return;
5706                 }
5707                 fd0 = fdset_remove(fds, fd0);
5708                 if (v[n] != ' ')
5709                         goto finalize;
5710                 p = v + n + 1;
5711         }
5712
5713         v = startswith(p, "netns-socket-1=");
5714         if (v) {
5715                 char *buf;
5716
5717                 n = strcspn(v, " ");
5718                 buf = strndupa(v, n);
5719                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5720                         log_debug("Unable to process exec-runtime netns fd specification.");
5721                         return;
5722                 }
5723                 fd1 = fdset_remove(fds, fd1);
5724         }
5725
5726 finalize:
5727
5728         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5729         if (r < 0)
5730                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5731 }
5732
5733 void exec_runtime_vacuum(Manager *m) {
5734         ExecRuntime *rt;
5735         Iterator i;
5736
5737         assert(m);
5738
5739         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5740
5741         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5742                 if (rt->n_ref > 0)
5743                         continue;
5744
5745                 (void) exec_runtime_free(rt, false);
5746         }
5747 }
5748
5749 void exec_params_clear(ExecParameters *p) {
5750         if (!p)
5751                 return;
5752
5753         strv_free(p->environment);
5754 }
5755
/* Maps each ExecInput value to its string name. Sized by _EXEC_INPUT_MAX; indexes without an initializer
 * are left NULL. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

/* Presumably generates the string <-> ExecInput lookup functions from the table above; see the macro's
 * definition for the exact names and semantics. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5768
/* Maps each ExecOutput value to its string name. Sized by _EXEC_OUTPUT_MAX; indexes without an
 * initializer are left NULL. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
};

/* Presumably generates the string <-> ExecOutput lookup functions from the table above; see the macro's
 * definition for the exact names and semantics. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5784
/* Maps each ExecUtmpMode value to its string name. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

/* Presumably generates the string <-> ExecUtmpMode lookup functions from the table above; see the macro's
 * definition for the exact names and semantics. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5792
/* Maps each ExecPreserveMode value to its string name. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* Lookup functions for the table above. NOTE(review): judging by the macro name, boolean spellings are
 * presumably accepted when parsing, with EXEC_PRESERVE_YES as the value a "true" input maps to. Confirm
 * against the macro's definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5800
/* This table maps ExecDirectoryType to the name of the unit setting it is configured with. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* Presumably generates the string <-> ExecDirectoryType lookup functions for the setting names above; see
 * the macro's definition for the exact names and semantics. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5811
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

/* Presumably generates the string <-> ExecDirectoryType lookup functions for the generic resource names
 * above, under the "exec_resource_type" prefix; see the macro's definition for details. */
DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5824
/* And this table also maps ExecDirectoryType, to the name of the environment variable in which the
 * selected directory is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

/* NOTE(review): judging by the macro name this only generates a file-local enum -> string function (no
 * string -> enum direction). Confirm against the macro's definition. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5836
/* Maps each ExecKeyringMode value to its string name. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

/* Presumably generates the string <-> ExecKeyringMode lookup functions from the table above; see the
 * macro's definition for the exact names and semantics. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);