src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/personality.h>
  10 #include <sys/prctl.h>
  11 #include <sys/shm.h>
  12 #include <sys/types.h>
  13 #include <sys/un.h>
  14 #include <unistd.h>
  15 #include <utmpx.h>
  16
  17 #if HAVE_PAM
  18 #include <security/pam_appl.h>
  19 #endif
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #if HAVE_SECCOMP
  26 #include <seccomp.h>
  27 #endif
  28
  29 #if HAVE_APPARMOR
  30 #include <sys/apparmor.h>
  31 #endif
  32
  33 #include "sd-messages.h"
  34
  35 #include "af-list.h"
  36 #include "alloc-util.h"
  37 #if HAVE_APPARMOR
  38 #include "apparmor-util.h"
  39 #endif
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "cap-list.h"
  43 #include "capability-util.h"
  44 #include "chown-recursive.h"
  45 #include "cgroup-setup.h"
  46 #include "cpu-set-util.h"
  47 #include "def.h"
  48 #include "env-file.h"
  49 #include "env-util.h"
  50 #include "errno-list.h"
  51 #include "execute.h"
  52 #include "exit-status.h"
  53 #include "fd-util.h"
  54 #include "format-util.h"
  55 #include "fs-util.h"
  56 #include "glob-util.h"
  57 #include "hexdecoct.h"
  58 #include "io-util.h"
  59 #include "ioprio.h"
  60 #include "label.h"
  61 #include "log.h"
  62 #include "macro.h"
  63 #include "manager.h"
  64 #include "memory-util.h"
  65 #include "missing_fs.h"
  66 #include "mkdir.h"
  67 #include "namespace.h"
  68 #include "parse-util.h"
  69 #include "path-util.h"
  70 #include "process-util.h"
  71 #include "rlimit-util.h"
  72 #include "rm-rf.h"
  73 #if HAVE_SECCOMP
  74 #include "seccomp-util.h"
  75 #endif
  76 #include "securebits-util.h"
  77 #include "selinux-util.h"
  78 #include "signal-util.h"
  79 #include "smack-util.h"
  80 #include "socket-util.h"
  81 #include "special.h"
  82 #include "stat-util.h"
  83 #include "string-table.h"
  84 #include "string-util.h"
  85 #include "strv.h"
  86 #include "syslog-util.h"
  87 #include "terminal-util.h"
  88 #include "umask-util.h"
  89 #include "unit.h"
  90 #include "user-util.h"
  91 #include "utmp-wtmp.h"
  92
  93 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  94 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  95
  96 #define SNDBUF_SIZE (8*1024*1024)
  97
  98 static int shift_fds(int fds[], size_t n_fds) {
  99         int start, restart_from;
 100
 101         if (n_fds <= 0)
 102                 return 0;
 103
 104         /* Modifies the fds array! (sorts it) */
 105
 106         assert(fds);
 107
 108         start = 0;
 109         for (;;) {
 110                 int i;
 111
 112                 restart_from = -1;
 113
 114                 for (i = start; i < (int) n_fds; i++) {
 115                         int nfd;
 116
 117                         /* Already at right index? */
 118                         if (fds[i] == i+3)
 119                                 continue;
 120
 121                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 122                         if (nfd < 0)
 123                                 return -errno;
 124
 125                         safe_close(fds[i]);
 126                         fds[i] = nfd;
 127
 128                         /* Hmm, the fd we wanted isn't free? Then
 129                          * let's remember that and try again from here */
 130                         if (nfd != i+3 && restart_from < 0)
 131                                 restart_from = i;
 132                 }
 133
 134                 if (restart_from < 0)
 135                         break;
 136
 137                 start = restart_from;
 138         }
 139
 140         return 0;
 141 }
 142
 143 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 144         size_t i, n_fds;
 145         int r;
 146
 147         n_fds = n_socket_fds + n_storage_fds;
 148         if (n_fds <= 0)
 149                 return 0;
 150
 151         assert(fds);
 152
 153         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 154          * O_NONBLOCK only applies to socket activation though. */
 155
 156         for (i = 0; i < n_fds; i++) {
 157
 158                 if (i < n_socket_fds) {
 159                         r = fd_nonblock(fds[i], nonblock);
 160                         if (r < 0)
 161                                 return r;
 162                 }
 163
 164                 /* We unconditionally drop FD_CLOEXEC from the fds,
 165                  * since after all we want to pass these fds to our
 166                  * children */
 167
 168                 r = fd_cloexec(fds[i], false);
 169                 if (r < 0)
 170                         return r;
 171         }
 172
 173         return 0;
 174 }
 175
 176 static const char *exec_context_tty_path(const ExecContext *context) {
 177         assert(context);
 178
 179         if (context->stdio_as_fds)
 180                 return NULL;
 181
 182         if (context->tty_path)
 183                 return context->tty_path;
 184
 185         return "/dev/console";
 186 }
 187
 188 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 189         const char *path;
 190
 191         assert(context);
 192
 193         path = exec_context_tty_path(context);
 194
 195         if (context->tty_vhangup) {
 196                 if (p && p->stdin_fd >= 0)
 197                         (void) terminal_vhangup_fd(p->stdin_fd);
 198                 else if (path)
 199                         (void) terminal_vhangup(path);
 200         }
 201
 202         if (context->tty_reset) {
 203                 if (p && p->stdin_fd >= 0)
 204                         (void) reset_terminal_fd(p->stdin_fd, true);
 205                 else if (path)
 206                         (void) reset_terminal(path);
 207         }
 208
 209         if (context->tty_vt_disallocate && path)
 210                 (void) vt_disallocate(path);
 211 }
 212
 213 static bool is_terminal_input(ExecInput i) {
 214         return IN_SET(i,
 215                       EXEC_INPUT_TTY,
 216                       EXEC_INPUT_TTY_FORCE,
 217                       EXEC_INPUT_TTY_FAIL);
 218 }
 219
 220 static bool is_terminal_output(ExecOutput o) {
 221         return IN_SET(o,
 222                       EXEC_OUTPUT_TTY,
 223                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 224                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 225 }
 226
 227 static bool is_kmsg_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_KMSG,
 230                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 231 }
 232
 233 static bool exec_context_needs_term(const ExecContext *c) {
 234         assert(c);
 235
 236         /* Return true if the execution context suggests we should set $TERM to something useful. */
 237
 238         if (is_terminal_input(c->std_input))
 239                 return true;
 240
 241         if (is_terminal_output(c->std_output))
 242                 return true;
 243
 244         if (is_terminal_output(c->std_error))
 245                 return true;
 246
 247         return !!c->tty_path;
 248 }
 249
 250 static int open_null_as(int flags, int nfd) {
 251         int fd;
 252
 253         assert(nfd >= 0);
 254
 255         fd = open("/dev/null", flags|O_NOCTTY);
 256         if (fd < 0)
 257                 return -errno;
 258
 259         return move_fd(fd, nfd, false);
 260 }
 261
 262 static int connect_journal_socket(
 263                 int fd,
 264                 const char *log_namespace,
 265                 uid_t uid,
 266                 gid_t gid) {
 267
 268         union sockaddr_union sa;
 269         socklen_t sa_len;
 270         uid_t olduid = UID_INVALID;
 271         gid_t oldgid = GID_INVALID;
 272         const char *j;
 273         int r;
 274
 275         j = log_namespace ?
 276                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 277                 "/run/systemd/journal/stdout";
 278         r = sockaddr_un_set_path(&sa.un, j);
 279         if (r < 0)
 280                 return r;
 281         sa_len = r;
 282
 283         if (gid_is_valid(gid)) {
 284                 oldgid = getgid();
 285
 286                 if (setegid(gid) < 0)
 287                         return -errno;
 288         }
 289
 290         if (uid_is_valid(uid)) {
 291                 olduid = getuid();
 292
 293                 if (seteuid(uid) < 0) {
 294                         r = -errno;
 295                         goto restore_gid;
 296                 }
 297         }
 298
 299         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 300
 301         /* If we fail to restore the uid or gid, things will likely
 302            fail later on. This should only happen if an LSM interferes. */
 303
 304         if (uid_is_valid(uid))
 305                 (void) seteuid(olduid);
 306
 307  restore_gid:
 308         if (gid_is_valid(gid))
 309                 (void) setegid(oldgid);
 310
 311         return r;
 312 }
 313
 314 static int connect_logger_as(
 315                 const Unit *unit,
 316                 const ExecContext *context,
 317                 const ExecParameters *params,
 318                 ExecOutput output,
 319                 const char *ident,
 320                 int nfd,
 321                 uid_t uid,
 322                 gid_t gid) {
 323
 324         _cleanup_close_ int fd = -1;
 325         int r;
 326
 327         assert(context);
 328         assert(params);
 329         assert(output < _EXEC_OUTPUT_MAX);
 330         assert(ident);
 331         assert(nfd >= 0);
 332
 333         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 334         if (fd < 0)
 335                 return -errno;
 336
 337         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 338         if (r < 0)
 339                 return r;
 340
 341         if (shutdown(fd, SHUT_RD) < 0)
 342                 return -errno;
 343
 344         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 345
 346         if (dprintf(fd,
 347                 "%s\n"
 348                 "%s\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n",
 354                 context->syslog_identifier ?: ident,
 355                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 356                 context->syslog_priority,
 357                 !!context->syslog_level_prefix,
 358                 false,
 359                 is_kmsg_output(output),
 360                 is_terminal_output(output)) < 0)
 361                 return -errno;
 362
 363         return move_fd(TAKE_FD(fd), nfd, false);
 364 }
 365
 366 static int open_terminal_as(const char *path, int flags, int nfd) {
 367         int fd;
 368
 369         assert(path);
 370         assert(nfd >= 0);
 371
 372         fd = open_terminal(path, flags | O_NOCTTY);
 373         if (fd < 0)
 374                 return fd;
 375
 376         return move_fd(fd, nfd, false);
 377 }
 378
 379 static int acquire_path(const char *path, int flags, mode_t mode) {
 380         union sockaddr_union sa;
 381         socklen_t sa_len;
 382         _cleanup_close_ int fd = -1;
 383         int r;
 384
 385         assert(path);
 386
 387         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 388                 flags |= O_CREAT;
 389
 390         fd = open(path, flags|O_NOCTTY, mode);
 391         if (fd >= 0)
 392                 return TAKE_FD(fd);
 393
 394         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 395                 return -errno;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         r = sockaddr_un_set_path(&sa.un, path);
 400         if (r < 0)
 401                 return r == -EINVAL ? -ENXIO : r;
 402         sa_len = r;
 403
 404         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 405         if (fd < 0)
 406                 return -errno;
 407
 408         if (connect(fd, &sa.sa, sa_len) < 0)
 409                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 410                                                            * indication that his wasn't an AF_UNIX socket after all */
 411
 412         if ((flags & O_ACCMODE) == O_RDONLY)
 413                 r = shutdown(fd, SHUT_WR);
 414         else if ((flags & O_ACCMODE) == O_WRONLY)
 415                 r = shutdown(fd, SHUT_RD);
 416         else
 417                 r = 0;
 418         if (r < 0)
 419                 return -errno;
 420
 421         return TAKE_FD(fd);
 422 }
 423
 424 static int fixup_input(
 425                 const ExecContext *context,
 426                 int socket_fd,
 427                 bool apply_tty_stdin) {
 428
 429         ExecInput std_input;
 430
 431         assert(context);
 432
 433         std_input = context->std_input;
 434
 435         if (is_terminal_input(std_input) && !apply_tty_stdin)
 436                 return EXEC_INPUT_NULL;
 437
 438         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         return std_input;
 445 }
 446
 447 static int fixup_output(ExecOutput std_output, int socket_fd) {
 448
 449         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 450                 return EXEC_OUTPUT_INHERIT;
 451
 452         return std_output;
 453 }
 454
 455 static int setup_input(
 456                 const ExecContext *context,
 457                 const ExecParameters *params,
 458                 int socket_fd,
 459                 const int named_iofds[static 3]) {
 460
 461         ExecInput i;
 462
 463         assert(context);
 464         assert(params);
 465         assert(named_iofds);
 466
 467         if (params->stdin_fd >= 0) {
 468                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 469                         return -errno;
 470
 471                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 472                 if (isatty(STDIN_FILENO)) {
 473                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 474                         (void) reset_terminal_fd(STDIN_FILENO, true);
 475                 }
 476
 477                 return STDIN_FILENO;
 478         }
 479
 480         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 481
 482         switch (i) {
 483
 484         case EXEC_INPUT_NULL:
 485                 return open_null_as(O_RDONLY, STDIN_FILENO);
 486
 487         case EXEC_INPUT_TTY:
 488         case EXEC_INPUT_TTY_FORCE:
 489         case EXEC_INPUT_TTY_FAIL: {
 490                 int fd;
 491
 492                 fd = acquire_terminal(exec_context_tty_path(context),
 493                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 494                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 495                                                                   ACQUIRE_TERMINAL_WAIT,
 496                                       USEC_INFINITY);
 497                 if (fd < 0)
 498                         return fd;
 499
 500                 return move_fd(fd, STDIN_FILENO, false);
 501         }
 502
 503         case EXEC_INPUT_SOCKET:
 504                 assert(socket_fd >= 0);
 505
 506                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 507
 508         case EXEC_INPUT_NAMED_FD:
 509                 assert(named_iofds[STDIN_FILENO] >= 0);
 510
 511                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 512                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_DATA: {
 515                 int fd;
 516
 517                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 518                 if (fd < 0)
 519                         return fd;
 520
 521                 return move_fd(fd, STDIN_FILENO, false);
 522         }
 523
 524         case EXEC_INPUT_FILE: {
 525                 bool rw;
 526                 int fd;
 527
 528                 assert(context->stdio_file[STDIN_FILENO]);
 529
 530                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 531                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 532
 533                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 534                 if (fd < 0)
 535                         return fd;
 536
 537                 return move_fd(fd, STDIN_FILENO, false);
 538         }
 539
 540         default:
 541                 assert_not_reached("Unknown input type");
 542         }
 543 }
 544
 545 static bool can_inherit_stderr_from_stdout(
 546                 const ExecContext *context,
 547                 ExecOutput o,
 548                 ExecOutput e) {
 549
 550         assert(context);
 551
 552         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 553          * stderr fd */
 554
 555         if (e == EXEC_OUTPUT_INHERIT)
 556                 return true;
 557         if (e != o)
 558                 return false;
 559
 560         if (e == EXEC_OUTPUT_NAMED_FD)
 561                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 562
 563         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 564                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 565
 566         return true;
 567 }
 568
 569 static int setup_output(
 570                 const Unit *unit,
 571                 const ExecContext *context,
 572                 const ExecParameters *params,
 573                 int fileno,
 574                 int socket_fd,
 575                 const int named_iofds[static 3],
 576                 const char *ident,
 577                 uid_t uid,
 578                 gid_t gid,
 579                 dev_t *journal_stream_dev,
 580                 ino_t *journal_stream_ino) {
 581
 582         ExecOutput o;
 583         ExecInput i;
 584         int r;
 585
 586         assert(unit);
 587         assert(context);
 588         assert(params);
 589         assert(ident);
 590         assert(journal_stream_dev);
 591         assert(journal_stream_ino);
 592
 593         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 594
 595                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 596                         return -errno;
 597
 598                 return STDOUT_FILENO;
 599         }
 600
 601         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 602                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 603                         return -errno;
 604
 605                 return STDERR_FILENO;
 606         }
 607
 608         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 609         o = fixup_output(context->std_output, socket_fd);
 610
 611         if (fileno == STDERR_FILENO) {
 612                 ExecOutput e;
 613                 e = fixup_output(context->std_error, socket_fd);
 614
 615                 /* This expects the input and output are already set up */
 616
 617                 /* Don't change the stderr file descriptor if we inherit all
 618                  * the way and are not on a tty */
 619                 if (e == EXEC_OUTPUT_INHERIT &&
 620                     o == EXEC_OUTPUT_INHERIT &&
 621                     i == EXEC_INPUT_NULL &&
 622                     !is_terminal_input(context->std_input) &&
 623                     getppid () != 1)
 624                         return fileno;
 625
 626                 /* Duplicate from stdout if possible */
 627                 if (can_inherit_stderr_from_stdout(context, o, e))
 628                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 629
 630                 o = e;
 631
 632         } else if (o == EXEC_OUTPUT_INHERIT) {
 633                 /* If input got downgraded, inherit the original value */
 634                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 635                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 636
 637                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 638                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 639                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 640
 641                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 642                 if (getppid() != 1)
 643                         return fileno;
 644
 645                 /* We need to open /dev/null here anew, to get the right access mode. */
 646                 return open_null_as(O_WRONLY, fileno);
 647         }
 648
 649         switch (o) {
 650
 651         case EXEC_OUTPUT_NULL:
 652                 return open_null_as(O_WRONLY, fileno);
 653
 654         case EXEC_OUTPUT_TTY:
 655                 if (is_terminal_input(i))
 656                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 657
 658                 /* We don't reset the terminal if this is just about output */
 659                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 660
 661         case EXEC_OUTPUT_KMSG:
 662         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 663         case EXEC_OUTPUT_JOURNAL:
 664         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 665                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 666                 if (r < 0) {
 667                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 668                         r = open_null_as(O_WRONLY, fileno);
 669                 } else {
 670                         struct stat st;
 671
 672                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 673                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 674                          * services to detect whether they are connected to the journal or not.
 675                          *
 676                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 677                          * about STDERR as that's usually the best way to do logging. */
 678
 679                         if (fstat(fileno, &st) >= 0 &&
 680                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 681                                 *journal_stream_dev = st.st_dev;
 682                                 *journal_stream_ino = st.st_ino;
 683                         }
 684                 }
 685                 return r;
 686
 687         case EXEC_OUTPUT_SOCKET:
 688                 assert(socket_fd >= 0);
 689
 690                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 691
 692         case EXEC_OUTPUT_NAMED_FD:
 693                 assert(named_iofds[fileno] >= 0);
 694
 695                 (void) fd_nonblock(named_iofds[fileno], false);
 696                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 697
 698         case EXEC_OUTPUT_FILE:
 699         case EXEC_OUTPUT_FILE_APPEND: {
 700                 bool rw;
 701                 int fd, flags;
 702
 703                 assert(context->stdio_file[fileno]);
 704
 705                 rw = context->std_input == EXEC_INPUT_FILE &&
 706                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 707
 708                 if (rw)
 709                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 710
 711                 flags = O_WRONLY;
 712                 if (o == EXEC_OUTPUT_FILE_APPEND)
 713                         flags |= O_APPEND;
 714
 715                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 716                 if (fd < 0)
 717                         return fd;
 718
 719                 return move_fd(fd, fileno, 0);
 720         }
 721
 722         default:
 723                 assert_not_reached("Unknown error type");
 724         }
 725 }
 726
 727 static int chown_terminal(int fd, uid_t uid) {
 728         int r;
 729
 730         assert(fd >= 0);
 731
 732         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 733         if (isatty(fd) < 1) {
 734                 if (IN_SET(errno, EINVAL, ENOTTY))
 735                         return 0; /* not a tty */
 736
 737                 return -errno;
 738         }
 739
 740         /* This might fail. What matters are the results. */
 741         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 742         if (r < 0)
 743                 return r;
 744
 745         return 1;
 746 }
 747
 748 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 749         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 750         int r;
 751
 752         assert(_saved_stdin);
 753         assert(_saved_stdout);
 754
 755         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 756         if (saved_stdin < 0)
 757                 return -errno;
 758
 759         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 760         if (saved_stdout < 0)
 761                 return -errno;
 762
 763         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 764         if (fd < 0)
 765                 return fd;
 766
 767         r = chown_terminal(fd, getuid());
 768         if (r < 0)
 769                 return r;
 770
 771         r = reset_terminal_fd(fd, true);
 772         if (r < 0)
 773                 return r;
 774
 775         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 776         fd = -1;
 777         if (r < 0)
 778                 return r;
 779
 780         *_saved_stdin = saved_stdin;
 781         *_saved_stdout = saved_stdout;
 782
 783         saved_stdin = saved_stdout = -1;
 784
 785         return 0;
 786 }
 787
 788 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 789         assert(err < 0);
 790
 791         if (err == -ETIMEDOUT)
 792                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 793         else {
 794                 errno = -err;
 795                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 796         }
 797 }
 798
 799 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 800         _cleanup_close_ int fd = -1;
 801
 802         assert(vc);
 803
 804         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 805         if (fd < 0)
 806                 return;
 807
 808         write_confirm_error_fd(err, fd, u);
 809 }
 810
 811 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 812         int r = 0;
 813
 814         assert(saved_stdin);
 815         assert(saved_stdout);
 816
 817         release_terminal();
 818
 819         if (*saved_stdin >= 0)
 820                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 821                         r = -errno;
 822
 823         if (*saved_stdout >= 0)
 824                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 825                         r = -errno;
 826
 827         *saved_stdin = safe_close(*saved_stdin);
 828         *saved_stdout = safe_close(*saved_stdout);
 829
 830         return r;
 831 }
 832
 833 enum {
 834         CONFIRM_PRETEND_FAILURE = -1,
 835         CONFIRM_PRETEND_SUCCESS =  0,
 836         CONFIRM_EXECUTE = 1,
 837 };
 838
 839 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 840         int saved_stdout = -1, saved_stdin = -1, r;
 841         _cleanup_free_ char *e = NULL;
 842         char c;
 843
 844         /* For any internal errors, assume a positive response. */
 845         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 846         if (r < 0) {
 847                 write_confirm_error(r, vc, u);
 848                 return CONFIRM_EXECUTE;
 849         }
 850
 851         /* confirm_spawn might have been disabled while we were sleeping. */
 852         if (manager_is_confirm_spawn_disabled(u->manager)) {
 853                 r = 1;
 854                 goto restore_stdio;
 855         }
 856
 857         e = ellipsize(cmdline, 60, 100);
 858         if (!e) {
 859                 log_oom();
 860                 r = CONFIRM_EXECUTE;
 861                 goto restore_stdio;
 862         }
 863
 864         for (;;) {
 865                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 866                 if (r < 0) {
 867                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 868                         r = CONFIRM_EXECUTE;
 869                         goto restore_stdio;
 870                 }
 871
 872                 switch (c) {
 873                 case 'c':
 874                         printf("Resuming normal execution.\n");
 875                         manager_disable_confirm_spawn();
 876                         r = 1;
 877                         break;
 878                 case 'D':
 879                         unit_dump(u, stdout, "  ");
 880                         continue; /* ask again */
 881                 case 'f':
 882                         printf("Failing execution.\n");
 883                         r = CONFIRM_PRETEND_FAILURE;
 884                         break;
 885                 case 'h':
 886                         printf("  c - continue, proceed without asking anymore\n"
 887                                "  D - dump, show the state of the unit\n"
 888                                "  f - fail, don't execute the command and pretend it failed\n"
 889                                "  h - help\n"
 890                                "  i - info, show a short summary of the unit\n"
 891                                "  j - jobs, show jobs that are in progress\n"
 892                                "  s - skip, don't execute the command and pretend it succeeded\n"
 893                                "  y - yes, execute the command\n");
 894                         continue; /* ask again */
 895                 case 'i':
 896                         printf("  Description: %s\n"
 897                                "  Unit:        %s\n"
 898                                "  Command:     %s\n",
 899                                u->id, u->description, cmdline);
 900                         continue; /* ask again */
 901                 case 'j':
 902                         manager_dump_jobs(u->manager, stdout, "  ");
 903                         continue; /* ask again */
 904                 case 'n':
 905                         /* 'n' was removed in favor of 'f'. */
 906                         printf("Didn't understand 'n', did you mean 'f'?\n");
 907                         continue; /* ask again */
 908                 case 's':
 909                         printf("Skipping execution.\n");
 910                         r = CONFIRM_PRETEND_SUCCESS;
 911                         break;
 912                 case 'y':
 913                         r = CONFIRM_EXECUTE;
 914                         break;
 915                 default:
 916                         assert_not_reached("Unhandled choice");
 917                 }
 918                 break;
 919         }
 920
 921 restore_stdio:
 922         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 923         return r;
 924 }
 925
 926 static int get_fixed_user(const ExecContext *c, const char **user,
 927                           uid_t *uid, gid_t *gid,
 928                           const char **home, const char **shell) {
 929         int r;
 930         const char *name;
 931
 932         assert(c);
 933
 934         if (!c->user)
 935                 return 0;
 936
 937         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 938          * (i.e. are "/" or "/bin/nologin"). */
 939
 940         name = c->user;
 941         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 942         if (r < 0)
 943                 return r;
 944
 945         *user = name;
 946         return 0;
 947 }
 948
 949 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->group)
 956                 return 0;
 957
 958         name = c->group;
 959         r = get_group_creds(&name, gid, 0);
 960         if (r < 0)
 961                 return r;
 962
 963         *group = name;
 964         return 0;
 965 }
 966
 967 static int get_supplementary_groups(const ExecContext *c, const char *user,
 968                                     const char *group, gid_t gid,
 969                                     gid_t **supplementary_gids, int *ngids) {
 970         char **i;
 971         int r, k = 0;
 972         int ngroups_max;
 973         bool keep_groups = false;
 974         gid_t *groups = NULL;
 975         _cleanup_free_ gid_t *l_gids = NULL;
 976
 977         assert(c);
 978
 979         /*
 980          * If user is given, then lookup GID and supplementary groups list.
 981          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 982          * here and as early as possible so we keep the list of supplementary
 983          * groups of the caller.
 984          */
 985         if (user && gid_is_valid(gid) && gid != 0) {
 986                 /* First step, initialize groups from /etc/groups */
 987                 if (initgroups(user, gid) < 0)
 988                         return -errno;
 989
 990                 keep_groups = true;
 991         }
 992
 993         if (strv_isempty(c->supplementary_groups))
 994                 return 0;
 995
 996         /*
 997          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 998          * be positive, otherwise fail.
 999          */
1000         errno = 0;
1001         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1002         if (ngroups_max <= 0)
1003                 return errno_or_else(EOPNOTSUPP);
1004
1005         l_gids = new(gid_t, ngroups_max);
1006         if (!l_gids)
1007                 return -ENOMEM;
1008
1009         if (keep_groups) {
1010                 /*
1011                  * Lookup the list of groups that the user belongs to, we
1012                  * avoid NSS lookups here too for gid=0.
1013                  */
1014                 k = ngroups_max;
1015                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1016                         return -EINVAL;
1017         } else
1018                 k = 0;
1019
1020         STRV_FOREACH(i, c->supplementary_groups) {
1021                 const char *g;
1022
1023                 if (k >= ngroups_max)
1024                         return -E2BIG;
1025
1026                 g = *i;
1027                 r = get_group_creds(&g, l_gids+k, 0);
1028                 if (r < 0)
1029                         return r;
1030
1031                 k++;
1032         }
1033
1034         /*
1035          * Sets ngids to zero to drop all supplementary groups, happens
1036          * when we are under root and SupplementaryGroups= is empty.
1037          */
1038         if (k == 0) {
1039                 *ngids = 0;
1040                 return 0;
1041         }
1042
1043         /* Otherwise get the final list of supplementary groups */
1044         groups = memdup(l_gids, sizeof(gid_t) * k);
1045         if (!groups)
1046                 return -ENOMEM;
1047
1048         *supplementary_gids = groups;
1049         *ngids = k;
1050
1051         groups = NULL;
1052
1053         return 0;
1054 }
1055
/* Applies the supplementary group list and the primary GID to the calling process.
 *
 * The supplementary groups are only set if ngids is positive, and the real/effective/saved GID is only
 * changed if gid is valid. Returns 0 on success, the negative error of maybe_setgroups(), or -errno if
 * setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Apply the supplementary groups first, but only if there are any */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Now switch all of our GIDs */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1074
1075 static int enforce_user(const ExecContext *context, uid_t uid) {
1076         assert(context);
1077
1078         if (!uid_is_valid(uid))
1079                 return 0;
1080
1081         /* Sets (but doesn't look up) the uid and make sure we keep the
1082          * capabilities while doing so. */
1083
1084         if (context->capability_ambient_set != 0) {
1085
1086                 /* First step: If we need to keep capabilities but
1087                  * drop privileges we need to make sure we keep our
1088                  * caps, while we drop privileges. */
1089                 if (uid != 0) {
1090                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1091
1092                         if (prctl(PR_GET_SECUREBITS) != sb)
1093                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1094                                         return -errno;
1095                 }
1096         }
1097
1098         /* Second step: actually set the uids */
1099         if (setresuid(uid, uid, uid) < 0)
1100                 return -errno;
1101
1102         /* At this point we should have all necessary capabilities but
1103            are otherwise a normal user. However, the caps might got
1104            corrupted due to the setresuid() so we need clean them up
1105            later. This is done outside of this call. */
1106
1107         return 0;
1108 }
1109
1110 #if HAVE_PAM
1111
1112 static int null_conv(
1113                 int num_msg,
1114                 const struct pam_message **msg,
1115                 struct pam_response **resp,
1116                 void *appdata_ptr) {
1117
1118         /* We don't support conversations */
1119
1120         return PAM_CONV_ERR;
1121 }
1122
1123 #endif
1124
/* Opens a PAM session for PAM service "name" and user "user", and forks off a helper process
 * ("(sd-pam)") that tears the session down again later.
 *
 * Parameters:
 *   name       - PAM service name, passed to pam_start()
 *   user       - user name, passed to pam_start()
 *   uid, gid   - credentials the helper process switches to
 *   tty        - optional TTY path for PAM_TTY; if NULL, the TTY name of stdin is tried instead
 *   env        - environment block; its entries are passed to PAM via pam_putenv(), and on success
 *                *env is replaced by the list returned by pam_getenvlist()
 *   fds, n_fds - file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On failure the error is logged and
 * -EPERM is returned for PAM errors, or the negative error of barrier_create()/safe_fork() otherwise.
 * When built without HAVE_PAM this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to end a handle we never got */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass our environment block on to PAM, entry by entry */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether it got reparented */
        parent_pid = getpid_cached();

        /* NOTE(review): if safe_fork() fails we jump to "fail" with SIGTERM still blocked; the mask
         * saved in old_ss is only restored on the success path below. */
        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, restarting the wait on EINTR */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* ret is still EXIT_PAM here unless everything above succeeded */
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment block provided by PAM to the caller, replacing the old one */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set), or one of our own calls did (r is set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1348
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of the
 * specified path. At most the trailing 8 characters of that component are used. If the component is
 * empty, the process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        /* Room for '(', up to 8 characters, ')' and the trailing NUL. The visible part hence never
         * exceeds 10 characters (i.e. the length of "/sbin/init"), which looks pretty in /bin/ps. */
        char buf[11];
        const char *base;
        size_t n, skip;
        char *w = buf;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* If the name is too long, keep its end: that is usually the more interesting part, since
         * the beginning might just be "systemd-". */
        n = strlen(base);
        skip = n > 8 ? n - 8 : 0;
        base += skip;
        n -= skip;

        *w++ = '(';
        memcpy(w, base, n);
        w += n;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1379
1380 static bool context_has_address_families(const ExecContext *c) {
1381         assert(c);
1382
1383         return c->address_families_allow_list ||
1384                 !set_isempty(c->address_families);
1385 }
1386
1387 static bool context_has_syscall_filters(const ExecContext *c) {
1388         assert(c);
1389
1390         return c->syscall_allow_list ||
1391                 !hashmap_isempty(c->syscall_filter);
1392 }
1393
1394 static bool context_has_no_new_privileges(const ExecContext *c) {
1395         assert(c);
1396
1397         if (c->no_new_privileges)
1398                 return true;
1399
1400         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1401                 return false;
1402
1403         /* We need NNP if we have any form of seccomp and are unprivileged */
1404         return context_has_address_families(c) ||
1405                 c->memory_deny_write_execute ||
1406                 c->restrict_realtime ||
1407                 c->restrict_suid_sgid ||
1408                 exec_context_restrict_namespaces_set(c) ||
1409                 c->protect_clock ||
1410                 c->protect_kernel_tunables ||
1411                 c->protect_kernel_modules ||
1412                 c->protect_kernel_logs ||
1413                 c->private_devices ||
1414                 context_has_syscall_filters(c) ||
1415                 !set_isempty(c->syscall_archs) ||
1416                 c->lock_personality ||
1417                 c->protect_hostname;
1418 }
1419
1420 #if HAVE_SECCOMP
1421
1422 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1423
1424         if (is_seccomp_available())
1425                 return false;
1426
1427         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428         return true;
1429 }
1430
1431 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1432         uint32_t negative_action, default_action, action;
1433         int r;
1434
1435         assert(u);
1436         assert(c);
1437
1438         if (!context_has_syscall_filters(c))
1439                 return 0;
1440
1441         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1442                 return 0;
1443
1444         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446         if (c->syscall_allow_list) {
1447                 default_action = negative_action;
1448                 action = SCMP_ACT_ALLOW;
1449         } else {
1450                 default_action = SCMP_ACT_ALLOW;
1451                 action = negative_action;
1452         }
1453
1454         if (needs_ambient_hack) {
1455                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456                 if (r < 0)
1457                         return r;
1458         }
1459
1460         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1464         assert(u);
1465         assert(c);
1466
1467         if (set_isempty(c->syscall_archs))
1468                 return 0;
1469
1470         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1471                 return 0;
1472
1473         return seccomp_restrict_archs(c->syscall_archs);
1474 }
1475
1476 static int apply_address_families(const Unit* u, const ExecContext *c) {
1477         assert(u);
1478         assert(c);
1479
1480         if (!context_has_address_families(c))
1481                 return 0;
1482
1483         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1484                 return 0;
1485
1486         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1487 }
1488
1489 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1490         assert(u);
1491         assert(c);
1492
1493         if (!c->memory_deny_write_execute)
1494                 return 0;
1495
1496         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1497                 return 0;
1498
1499         return seccomp_memory_deny_write_execute();
1500 }
1501
1502 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1503         assert(u);
1504         assert(c);
1505
1506         if (!c->restrict_realtime)
1507                 return 0;
1508
1509         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1510                 return 0;
1511
1512         return seccomp_restrict_realtime();
1513 }
1514
1515 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1516         assert(u);
1517         assert(c);
1518
1519         if (!c->restrict_suid_sgid)
1520                 return 0;
1521
1522         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1523                 return 0;
1524
1525         return seccomp_restrict_suid_sgid();
1526 }
1527
1528 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1529         assert(u);
1530         assert(c);
1531
1532         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1533          * let's protect even those systems where this is left on in the kernel. */
1534
1535         if (!c->protect_kernel_tunables)
1536                 return 0;
1537
1538         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1539                 return 0;
1540
1541         return seccomp_protect_sysctl();
1542 }
1543
1544 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1545         assert(u);
1546         assert(c);
1547
1548         /* Turn off module syscalls on ProtectKernelModules=yes */
1549
1550         if (!c->protect_kernel_modules)
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1554                 return 0;
1555
1556         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1557 }
1558
1559 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1560         assert(u);
1561         assert(c);
1562
1563         if (!c->protect_kernel_logs)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1567                 return 0;
1568
1569         return seccomp_protect_syslog();
1570 }
1571
1572 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1573         assert(u);
1574         assert(c);
1575
1576         if (!c->protect_clock)
1577                 return 0;
1578
1579         if (skip_seccomp_unavailable(u, "ProtectClock="))
1580                 return 0;
1581
1582         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1583 }
1584
1585 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1586         assert(u);
1587         assert(c);
1588
1589         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1590
1591         if (!c->private_devices)
1592                 return 0;
1593
1594         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1595                 return 0;
1596
1597         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1598 }
1599
1600 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1601         assert(u);
1602         assert(c);
1603
1604         if (!exec_context_restrict_namespaces_set(c))
1605                 return 0;
1606
1607         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1608                 return 0;
1609
1610         return seccomp_restrict_namespaces(c->restrict_namespaces);
1611 }
1612
1613 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1614         unsigned long personality;
1615         int r;
1616
1617         assert(u);
1618         assert(c);
1619
1620         if (!c->lock_personality)
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "LockPersonality="))
1624                 return 0;
1625
1626         personality = c->personality;
1627
1628         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1629         if (personality == PERSONALITY_INVALID) {
1630
1631                 r = opinionated_personality(&personality);
1632                 if (r < 0)
1633                         return r;
1634         }
1635
1636         return seccomp_lock_personality(personality);
1637 }
1638
1639 #endif
1640
1641 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1642         assert(u);
1643         assert(c);
1644
1645         if (!c->protect_hostname)
1646                 return 0;
1647
1648         if (ns_type_supported(NAMESPACE_UTS)) {
1649                 if (unshare(CLONE_NEWUTS) < 0) {
1650                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1651                                 *ret_exit_status = EXIT_NAMESPACE;
1652                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1653                         }
1654
1655                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1656                 }
1657         } else
1658                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1659
1660 #if HAVE_SECCOMP
1661         int r;
1662
1663         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1664                 return 0;
1665
1666         r = seccomp_protect_hostname();
1667         if (r < 0) {
1668                 *ret_exit_status = EXIT_SECCOMP;
1669                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1670         }
1671 #endif
1672
1673         return 0;
1674 }
1675
1676 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1677         assert(idle_pipe);
1678
1679         idle_pipe[1] = safe_close(idle_pipe[1]);
1680         idle_pipe[2] = safe_close(idle_pipe[2]);
1681
1682         if (idle_pipe[0] >= 0) {
1683                 int r;
1684
1685                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1686
1687                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1688                         ssize_t n;
1689
1690                         /* Signal systemd that we are bored and want to continue. */
1691                         n = write(idle_pipe[3], "x", 1);
1692                         if (n > 0)
1693                                 /* Wait for systemd to react to the signal above. */
1694                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1695                 }
1696
1697                 idle_pipe[0] = safe_close(idle_pipe[0]);
1698
1699         }
1700
1701         idle_pipe[3] = safe_close(idle_pipe[3]);
1702 }
1703
1704 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1705
1706 static int build_environment(
1707                 const Unit *u,
1708                 const ExecContext *c,
1709                 const ExecParameters *p,
1710                 size_t n_fds,
1711                 const char *home,
1712                 const char *username,
1713                 const char *shell,
1714                 dev_t journal_stream_dev,
1715                 ino_t journal_stream_ino,
1716                 char ***ret) {
1717
1718         _cleanup_strv_free_ char **our_env = NULL;
1719         ExecDirectoryType t;
1720         size_t n_env = 0;
1721         char *x;
1722
1723         assert(u);
1724         assert(c);
1725         assert(p);
1726         assert(ret);
1727
1728 #define N_ENV_VARS 15
1729         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1730         if (!our_env)
1731                 return -ENOMEM;
1732
1733         if (n_fds > 0) {
1734                 _cleanup_free_ char *joined = NULL;
1735
1736                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1737                         return -ENOMEM;
1738                 our_env[n_env++] = x;
1739
1740                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1741                         return -ENOMEM;
1742                 our_env[n_env++] = x;
1743
1744                 joined = strv_join(p->fd_names, ":");
1745                 if (!joined)
1746                         return -ENOMEM;
1747
1748                 x = strjoin("LISTEN_FDNAMES=", joined);
1749                 if (!x)
1750                         return -ENOMEM;
1751                 our_env[n_env++] = x;
1752         }
1753
1754         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1755                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1756                         return -ENOMEM;
1757                 our_env[n_env++] = x;
1758
1759                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1760                         return -ENOMEM;
1761                 our_env[n_env++] = x;
1762         }
1763
1764         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1765          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1766          * check the database directly. */
1767         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1768                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1769                 if (!x)
1770                         return -ENOMEM;
1771                 our_env[n_env++] = x;
1772         }
1773
1774         if (home) {
1775                 x = strjoin("HOME=", home);
1776                 if (!x)
1777                         return -ENOMEM;
1778
1779                 path_simplify(x + 5, true);
1780                 our_env[n_env++] = x;
1781         }
1782
1783         if (username) {
1784                 x = strjoin("LOGNAME=", username);
1785                 if (!x)
1786                         return -ENOMEM;
1787                 our_env[n_env++] = x;
1788
1789                 x = strjoin("USER=", username);
1790                 if (!x)
1791                         return -ENOMEM;
1792                 our_env[n_env++] = x;
1793         }
1794
1795         if (shell) {
1796                 x = strjoin("SHELL=", shell);
1797                 if (!x)
1798                         return -ENOMEM;
1799
1800                 path_simplify(x + 6, true);
1801                 our_env[n_env++] = x;
1802         }
1803
1804         if (!sd_id128_is_null(u->invocation_id)) {
1805                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1806                         return -ENOMEM;
1807
1808                 our_env[n_env++] = x;
1809         }
1810
1811         if (exec_context_needs_term(c)) {
1812                 const char *tty_path, *term = NULL;
1813
1814                 tty_path = exec_context_tty_path(c);
1815
1816                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1817                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1818                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1819
1820                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1821                         term = getenv("TERM");
1822
1823                 if (!term)
1824                         term = default_term_for_tty(tty_path);
1825
1826                 x = strjoin("TERM=", term);
1827                 if (!x)
1828                         return -ENOMEM;
1829                 our_env[n_env++] = x;
1830         }
1831
1832         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1833                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1834                         return -ENOMEM;
1835
1836                 our_env[n_env++] = x;
1837         }
1838
1839         if (c->log_namespace) {
1840                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1841                 if (!x)
1842                         return -ENOMEM;
1843
1844                 our_env[n_env++] = x;
1845         }
1846
1847         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1848                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1849                 const char *n;
1850
1851                 if (!p->prefix[t])
1852                         continue;
1853
1854                 if (strv_isempty(c->directories[t].paths))
1855                         continue;
1856
1857                 n = exec_directory_env_name_to_string(t);
1858                 if (!n)
1859                         continue;
1860
1861                 pre = strjoin(p->prefix[t], "/");
1862                 if (!pre)
1863                         return -ENOMEM;
1864
1865                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1866                 if (!joined)
1867                         return -ENOMEM;
1868
1869                 x = strjoin(n, "=", joined);
1870                 if (!x)
1871                         return -ENOMEM;
1872
1873                 our_env[n_env++] = x;
1874         }
1875
1876         our_env[n_env++] = NULL;
1877         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1878 #undef N_ENV_VARS
1879
1880         *ret = TAKE_PTR(our_env);
1881
1882         return 0;
1883 }
1884
1885 static int build_pass_environment(const ExecContext *c, char ***ret) {
1886         _cleanup_strv_free_ char **pass_env = NULL;
1887         size_t n_env = 0, n_bufsize = 0;
1888         char **i;
1889
1890         STRV_FOREACH(i, c->pass_environment) {
1891                 _cleanup_free_ char *x = NULL;
1892                 char *v;
1893
1894                 v = getenv(*i);
1895                 if (!v)
1896                         continue;
1897                 x = strjoin(*i, "=", v);
1898                 if (!x)
1899                         return -ENOMEM;
1900
1901                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1902                         return -ENOMEM;
1903
1904                 pass_env[n_env++] = TAKE_PTR(x);
1905                 pass_env[n_env] = NULL;
1906         }
1907
1908         *ret = TAKE_PTR(pass_env);
1909
1910         return 0;
1911 }
1912
1913 static bool exec_needs_mount_namespace(
1914                 const ExecContext *context,
1915                 const ExecParameters *params,
1916                 const ExecRuntime *runtime) {
1917
1918         assert(context);
1919         assert(params);
1920
1921         if (context->root_image)
1922                 return true;
1923
1924         if (!strv_isempty(context->read_write_paths) ||
1925             !strv_isempty(context->read_only_paths) ||
1926             !strv_isempty(context->inaccessible_paths))
1927                 return true;
1928
1929         if (context->n_bind_mounts > 0)
1930                 return true;
1931
1932         if (context->n_temporary_filesystems > 0)
1933                 return true;
1934
1935         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1936                 return true;
1937
1938         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1939                 return true;
1940
1941         if (context->private_devices ||
1942             context->private_mounts ||
1943             context->protect_system != PROTECT_SYSTEM_NO ||
1944             context->protect_home != PROTECT_HOME_NO ||
1945             context->protect_kernel_tunables ||
1946             context->protect_kernel_modules ||
1947             context->protect_kernel_logs ||
1948             context->protect_control_groups)
1949                 return true;
1950
1951         if (context->root_directory) {
1952                 ExecDirectoryType t;
1953
1954                 if (context->mount_apivfs)
1955                         return true;
1956
1957                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1958                         if (!params->prefix[t])
1959                                 continue;
1960
1961                         if (!strv_isempty(context->directories[t].paths))
1962                                 return true;
1963                 }
1964         }
1965
1966         if (context->dynamic_user &&
1967             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1968              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1969              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1970                 return true;
1971
1972         if (context->log_namespace)
1973                 return true;
1974
1975         return false;
1976 }
1977
1978 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1979         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1980         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1981         _cleanup_close_ int unshare_ready_fd = -1;
1982         _cleanup_(sigkill_waitp) pid_t pid = 0;
1983         uint64_t c = 1;
1984         ssize_t n;
1985         int r;
1986
1987         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1988          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1989          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1990          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1991          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1992          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1993          * continues execution normally.
1994          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
1995          * does not need CAP_SETUID to write the single line mapping to itself. */
1996
1997         /* Can only set up multiple mappings with CAP_SETUID. */
1998         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
1999                 r = asprintf(&uid_map,
2000                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2001                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2002                              ouid, ouid, uid, uid);
2003         else
2004                 r = asprintf(&uid_map,
2005                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2006                              ouid, ouid);
2007
2008         if (r < 0)
2009                 return -ENOMEM;
2010
2011         /* Can only set up multiple mappings with CAP_SETGID. */
2012         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2013                 r = asprintf(&gid_map,
2014                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2015                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2016                              ogid, ogid, gid, gid);
2017         else
2018                 r = asprintf(&gid_map,
2019                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2020                              ogid, ogid);
2021
2022         if (r < 0)
2023                 return -ENOMEM;
2024
2025         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2026          * namespace. */
2027         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2028         if (unshare_ready_fd < 0)
2029                 return -errno;
2030
2031         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2032          * failed. */
2033         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2034                 return -errno;
2035
2036         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2037         if (r < 0)
2038                 return r;
2039         if (r == 0) {
2040                 _cleanup_close_ int fd = -1;
2041                 const char *a;
2042                 pid_t ppid;
2043
2044                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2045                  * here, after the parent opened its own user namespace. */
2046
2047                 ppid = getppid();
2048                 errno_pipe[0] = safe_close(errno_pipe[0]);
2049
2050                 /* Wait until the parent unshared the user namespace */
2051                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2052                         r = -errno;
2053                         goto child_fail;
2054                 }
2055
2056                 /* Disable the setgroups() system call in the child user namespace, for good. */
2057                 a = procfs_file_alloca(ppid, "setgroups");
2058                 fd = open(a, O_WRONLY|O_CLOEXEC);
2059                 if (fd < 0) {
2060                         if (errno != ENOENT) {
2061                                 r = -errno;
2062                                 goto child_fail;
2063                         }
2064
2065                         /* If the file is missing the kernel is too old, let's continue anyway. */
2066                 } else {
2067                         if (write(fd, "deny\n", 5) < 0) {
2068                                 r = -errno;
2069                                 goto child_fail;
2070                         }
2071
2072                         fd = safe_close(fd);
2073                 }
2074
2075                 /* First write the GID map */
2076                 a = procfs_file_alloca(ppid, "gid_map");
2077                 fd = open(a, O_WRONLY|O_CLOEXEC);
2078                 if (fd < 0) {
2079                         r = -errno;
2080                         goto child_fail;
2081                 }
2082                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2083                         r = -errno;
2084                         goto child_fail;
2085                 }
2086                 fd = safe_close(fd);
2087
2088                 /* The write the UID map */
2089                 a = procfs_file_alloca(ppid, "uid_map");
2090                 fd = open(a, O_WRONLY|O_CLOEXEC);
2091                 if (fd < 0) {
2092                         r = -errno;
2093                         goto child_fail;
2094                 }
2095                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2096                         r = -errno;
2097                         goto child_fail;
2098                 }
2099
2100                 _exit(EXIT_SUCCESS);
2101
2102         child_fail:
2103                 (void) write(errno_pipe[1], &r, sizeof(r));
2104                 _exit(EXIT_FAILURE);
2105         }
2106
2107         errno_pipe[1] = safe_close(errno_pipe[1]);
2108
2109         if (unshare(CLONE_NEWUSER) < 0)
2110                 return -errno;
2111
2112         /* Let the child know that the namespace is ready now */
2113         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2114                 return -errno;
2115
2116         /* Try to read an error code from the child */
2117         n = read(errno_pipe[0], &r, sizeof(r));
2118         if (n < 0)
2119                 return -errno;
2120         if (n == sizeof(r)) { /* an error code was sent to us */
2121                 if (r < 0)
2122                         return r;
2123                 return -EIO;
2124         }
2125         if (n != 0) /* on success we should have read 0 bytes */
2126                 return -EIO;
2127
2128         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2129         pid = 0;
2130         if (r < 0)
2131                 return r;
2132         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2133                 return -EIO;
2134
2135         return 0;
2136 }
2137
2138 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2139         if (!context->dynamic_user)
2140                 return false;
2141
2142         if (type == EXEC_DIRECTORY_CONFIGURATION)
2143                 return false;
2144
2145         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2146                 return false;
2147
2148         return true;
2149 }
2150
2151 static int setup_exec_directory(
2152                 const ExecContext *context,
2153                 const ExecParameters *params,
2154                 uid_t uid,
2155                 gid_t gid,
2156                 ExecDirectoryType type,
2157                 int *exit_status) {
2158
2159         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2160                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2161                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2162                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2163                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2164                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2165         };
2166         char **rt;
2167         int r;
2168
2169         assert(context);
2170         assert(params);
2171         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2172         assert(exit_status);
2173
2174         if (!params->prefix[type])
2175                 return 0;
2176
2177         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2178                 if (!uid_is_valid(uid))
2179                         uid = 0;
2180                 if (!gid_is_valid(gid))
2181                         gid = 0;
2182         }
2183
2184         STRV_FOREACH(rt, context->directories[type].paths) {
2185                 _cleanup_free_ char *p = NULL, *pp = NULL;
2186
2187                 p = path_join(params->prefix[type], *rt);
2188                 if (!p) {
2189                         r = -ENOMEM;
2190                         goto fail;
2191                 }
2192
2193                 r = mkdir_parents_label(p, 0755);
2194                 if (r < 0)
2195                         goto fail;
2196
2197                 if (exec_directory_is_private(context, type)) {
2198                         _cleanup_free_ char *private_root = NULL;
2199
2200                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2201                          * case we want to avoid leaving a directory around fully accessible that is owned by
2202                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2203                          * trick used by container managers to prohibit host users to get access to files of
2204                          * the same UID in containers: we place everything inside a directory that has an
2205                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2206                          * for unprivileged host code. We then use fs namespacing to make this directory
2207                          * permeable for the service itself.
2208                          *
2209                          * Specifically: for a service which wants a special directory "foo/" we first create
2210                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2211                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2212                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2213                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2214                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2215                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2216                          * for the service and making sure it only gets access to the dirs it needs but no
2217                          * others. Tricky? Yes, absolutely, but it works!
2218                          *
2219                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2220                          * to be owned by the service itself.
2221                          *
2222                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2223                          * for sharing files or sockets with other services. */
2224
2225                         private_root = path_join(params->prefix[type], "private");
2226                         if (!private_root) {
2227                                 r = -ENOMEM;
2228                                 goto fail;
2229                         }
2230
2231                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2232                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2233                         if (r < 0)
2234                                 goto fail;
2235
2236                         pp = path_join(private_root, *rt);
2237                         if (!pp) {
2238                                 r = -ENOMEM;
2239                                 goto fail;
2240                         }
2241
2242                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2243                         r = mkdir_parents_label(pp, 0755);
2244                         if (r < 0)
2245                                 goto fail;
2246
2247                         if (is_dir(p, false) > 0 &&
2248                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2249
2250                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2251                                  * it over. Most likely the service has been upgraded from one that didn't use
2252                                  * DynamicUser=1, to one that does. */
2253
2254                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2255                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2256                                          exec_directory_type_to_string(type), p, pp);
2257
2258                                 if (rename(p, pp) < 0) {
2259                                         r = -errno;
2260                                         goto fail;
2261                                 }
2262                         } else {
2263                                 /* Otherwise, create the actual directory for the service */
2264
2265                                 r = mkdir_label(pp, context->directories[type].mode);
2266                                 if (r < 0 && r != -EEXIST)
2267                                         goto fail;
2268                         }
2269
2270                         /* And link it up from the original place */
2271                         r = symlink_idempotent(pp, p, true);
2272                         if (r < 0)
2273                                 goto fail;
2274
2275                 } else {
2276                         _cleanup_free_ char *target = NULL;
2277
2278                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2279                             readlink_and_make_absolute(p, &target) >= 0) {
2280                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2281
2282                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2283                                  * by DynamicUser=1 (see above)?
2284                                  *
2285                                  * We do this for all directory types except for ConfigurationDirectory=,
2286                                  * since they all support the private/ symlink logic at least in some
2287                                  * configurations, see above. */
2288
2289                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2290                                 if (r < 0)
2291                                         goto fail;
2292
2293                                 q = path_join(params->prefix[type], "private", *rt);
2294                                 if (!q) {
2295                                         r = -ENOMEM;
2296                                         goto fail;
2297                                 }
2298
2299                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2300                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2301                                 if (r < 0)
2302                                         goto fail;
2303
2304                                 if (path_equal(q_resolved, target_resolved)) {
2305
2306                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2307                                          * but is no longer. Let's move the directory back up. */
2308
2309                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2310                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2311                                                  exec_directory_type_to_string(type), q, p);
2312
2313                                         if (unlink(p) < 0) {
2314                                                 r = -errno;
2315                                                 goto fail;
2316                                         }
2317
2318                                         if (rename(q, p) < 0) {
2319                                                 r = -errno;
2320                                                 goto fail;
2321                                         }
2322                                 }
2323                         }
2324
2325                         r = mkdir_label(p, context->directories[type].mode);
2326                         if (r < 0) {
2327                                 if (r != -EEXIST)
2328                                         goto fail;
2329
2330                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2331                                         struct stat st;
2332
2333                                         /* Don't change the owner/access mode of the configuration directory,
2334                                          * as in the common case it is not written to by a service, and shall
2335                                          * not be writable. */
2336
2337                                         if (stat(p, &st) < 0) {
2338                                                 r = -errno;
2339                                                 goto fail;
2340                                         }
2341
2342                                         /* Still complain if the access mode doesn't match */
2343                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2344                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2345                                                             "(File system: %o %sMode: %o)",
2346                                                             exec_directory_type_to_string(type), *rt,
2347                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2348
2349                                         continue;
2350                                 }
2351                         }
2352                 }
2353
2354                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2355                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2356                  * current UID/GID ownership.) */
2357                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2358                 if (r < 0)
2359                         goto fail;
2360
2361                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2362                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2363                  * assignments to exist.*/
2364                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2365                 if (r < 0)
2366                         goto fail;
2367         }
2368
2369         return 0;
2370
2371 fail:
2372         *exit_status = exit_status_table[type];
2373         return r;
2374 }
2375
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID argument 0).
 *
 * If context->smack_process_label is set, that label is applied. Otherwise, and only if the build
 * defines SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label read from command->path is applied,
 * falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read; -ENODATA and -EOPNOTSUPP from the
 * read are tolerated.
 *
 * Returns 0 on success or if there is nothing to do, a negative errno-style value otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        _cleanup_free_ char *binary_label = NULL;
#endif
        int r;

        assert(context);
        assert(command);

        /* An explicitly configured label always wins */
        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &binary_label);
        if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                return r;

        r = mac_smack_apply_pid(0, binary_label ?: SMACK_DEFAULT_PROCESS_LABEL);
        if (r < 0)
                return r;
#endif

        return 0;
}
#endif
2408
2409 static int compile_bind_mounts(
2410                 const ExecContext *context,
2411                 const ExecParameters *params,
2412                 BindMount **ret_bind_mounts,
2413                 size_t *ret_n_bind_mounts,
2414                 char ***ret_empty_directories) {
2415
2416         _cleanup_strv_free_ char **empty_directories = NULL;
2417         BindMount *bind_mounts;
2418         size_t n, h = 0, i;
2419         ExecDirectoryType t;
2420         int r;
2421
2422         assert(context);
2423         assert(params);
2424         assert(ret_bind_mounts);
2425         assert(ret_n_bind_mounts);
2426         assert(ret_empty_directories);
2427
2428         n = context->n_bind_mounts;
2429         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2430                 if (!params->prefix[t])
2431                         continue;
2432
2433                 n += strv_length(context->directories[t].paths);
2434         }
2435
2436         if (n <= 0) {
2437                 *ret_bind_mounts = NULL;
2438                 *ret_n_bind_mounts = 0;
2439                 *ret_empty_directories = NULL;
2440                 return 0;
2441         }
2442
2443         bind_mounts = new(BindMount, n);
2444         if (!bind_mounts)
2445                 return -ENOMEM;
2446
2447         for (i = 0; i < context->n_bind_mounts; i++) {
2448                 BindMount *item = context->bind_mounts + i;
2449                 char *s, *d;
2450
2451                 s = strdup(item->source);
2452                 if (!s) {
2453                         r = -ENOMEM;
2454                         goto finish;
2455                 }
2456
2457                 d = strdup(item->destination);
2458                 if (!d) {
2459                         free(s);
2460                         r = -ENOMEM;
2461                         goto finish;
2462                 }
2463
2464                 bind_mounts[h++] = (BindMount) {
2465                         .source = s,
2466                         .destination = d,
2467                         .read_only = item->read_only,
2468                         .recursive = item->recursive,
2469                         .ignore_enoent = item->ignore_enoent,
2470                 };
2471         }
2472
2473         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2474                 char **suffix;
2475
2476                 if (!params->prefix[t])
2477                         continue;
2478
2479                 if (strv_isempty(context->directories[t].paths))
2480                         continue;
2481
2482                 if (exec_directory_is_private(context, t) &&
2483                     !(context->root_directory || context->root_image)) {
2484                         char *private_root;
2485
2486                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2487                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2488                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2489
2490                         private_root = path_join(params->prefix[t], "private");
2491                         if (!private_root) {
2492                                 r = -ENOMEM;
2493                                 goto finish;
2494                         }
2495
2496                         r = strv_consume(&empty_directories, private_root);
2497                         if (r < 0)
2498                                 goto finish;
2499                 }
2500
2501                 STRV_FOREACH(suffix, context->directories[t].paths) {
2502                         char *s, *d;
2503
2504                         if (exec_directory_is_private(context, t))
2505                                 s = path_join(params->prefix[t], "private", *suffix);
2506                         else
2507                                 s = path_join(params->prefix[t], *suffix);
2508                         if (!s) {
2509                                 r = -ENOMEM;
2510                                 goto finish;
2511                         }
2512
2513                         if (exec_directory_is_private(context, t) &&
2514                             (context->root_directory || context->root_image))
2515                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2516                                  * directory is not created on the root directory. So, let's bind-mount the directory
2517                                  * on the 'non-private' place. */
2518                                 d = path_join(params->prefix[t], *suffix);
2519                         else
2520                                 d = strdup(s);
2521                         if (!d) {
2522                                 free(s);
2523                                 r = -ENOMEM;
2524                                 goto finish;
2525                         }
2526
2527                         bind_mounts[h++] = (BindMount) {
2528                                 .source = s,
2529                                 .destination = d,
2530                                 .read_only = false,
2531                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2532                                 .recursive = true,
2533                                 .ignore_enoent = false,
2534                         };
2535                 }
2536         }
2537
2538         assert(h == n);
2539
2540         *ret_bind_mounts = bind_mounts;
2541         *ret_n_bind_mounts = n;
2542         *ret_empty_directories = TAKE_PTR(empty_directories);
2543
2544         return (int) n;
2545
2546 finish:
2547         bind_mount_free_many(bind_mounts, h);
2548         return r;
2549 }
2550
2551 static bool insist_on_sandboxing(
2552                 const ExecContext *context,
2553                 const char *root_dir,
2554                 const char *root_image,
2555                 const BindMount *bind_mounts,
2556                 size_t n_bind_mounts) {
2557
2558         size_t i;
2559
2560         assert(context);
2561         assert(n_bind_mounts == 0 || bind_mounts);
2562
2563         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2564          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2565          * rearrange stuff in a way we cannot ignore gracefully. */
2566
2567         if (context->n_temporary_filesystems > 0)
2568                 return true;
2569
2570         if (root_dir || root_image)
2571                 return true;
2572
2573         if (context->dynamic_user)
2574                 return true;
2575
2576         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2577          * essential. */
2578         for (i = 0; i < n_bind_mounts; i++)
2579                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2580                         return true;
2581
2582         if (context->log_namespace)
2583                 return true;
2584
2585         return false;
2586 }
2587
2588 static int apply_mount_namespace(
2589                 const Unit *u,
2590                 const ExecCommand *command,
2591                 const ExecContext *context,
2592                 const ExecParameters *params,
2593                 const ExecRuntime *runtime,
2594                 char **error_path) {
2595
2596         _cleanup_strv_free_ char **empty_directories = NULL;
2597         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
2598         const char *root_dir = NULL, *root_image = NULL;
2599         NamespaceInfo ns_info;
2600         bool needs_sandboxing;
2601         BindMount *bind_mounts = NULL;
2602         size_t n_bind_mounts = 0;
2603         int r;
2604
2605         assert(context);
2606
2607         if (params->flags & EXEC_APPLY_CHROOT) {
2608                 root_image = context->root_image;
2609
2610                 if (!root_image)
2611                         root_dir = context->root_directory;
2612         }
2613
2614         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2615         if (r < 0)
2616                 return r;
2617
2618         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2619         if (needs_sandboxing) {
2620                 /* The runtime struct only contains the parent of the private /tmp,
2621                  * which is non-accessible to world users. Inside of it there's a /tmp
2622                  * that is sticky, and that's the one we want to use here.
2623                  * This does not apply when we are using /run/systemd/empty as fallback. */
2624
2625                 if (context->private_tmp && runtime) {
2626                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
2627                                 tmp_dir = runtime->tmp_dir;
2628                         else if (runtime->tmp_dir)
2629                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
2630
2631                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
2632                                 var_tmp_dir = runtime->var_tmp_dir;
2633                         else if (runtime->tmp_dir)
2634                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
2635                 }
2636
2637                 ns_info = (NamespaceInfo) {
2638                         .ignore_protect_paths = false,
2639                         .private_dev = context->private_devices,
2640                         .protect_control_groups = context->protect_control_groups,
2641                         .protect_kernel_tunables = context->protect_kernel_tunables,
2642                         .protect_kernel_modules = context->protect_kernel_modules,
2643                         .protect_kernel_logs = context->protect_kernel_logs,
2644                         .protect_hostname = context->protect_hostname,
2645                         .mount_apivfs = context->mount_apivfs,
2646                         .private_mounts = context->private_mounts,
2647                 };
2648         } else if (!context->dynamic_user && root_dir)
2649                 /*
2650                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2651                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2652                  * fail if we are enable to apply the sandbox inside the mount namespace.
2653                  */
2654                 ns_info = (NamespaceInfo) {
2655                         .ignore_protect_paths = true,
2656                 };
2657         else
2658                 ns_info = (NamespaceInfo) {};
2659
2660         if (context->mount_flags == MS_SHARED)
2661                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2662
2663         r = setup_namespace(root_dir, root_image,
2664                             &ns_info, context->read_write_paths,
2665                             needs_sandboxing ? context->read_only_paths : NULL,
2666                             needs_sandboxing ? context->inaccessible_paths : NULL,
2667                             empty_directories,
2668                             bind_mounts,
2669                             n_bind_mounts,
2670                             context->temporary_filesystems,
2671                             context->n_temporary_filesystems,
2672                             tmp_dir,
2673                             var_tmp_dir,
2674                             context->log_namespace,
2675                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2676                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2677                             context->mount_flags,
2678                             context->root_hash, context->root_hash_size, context->root_hash_path,
2679                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
2680                             context->root_verity,
2681                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2682                             error_path);
2683
2684         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2685          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2686          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2687          * completely different execution environment. */
2688         if (r == -ENOANO) {
2689                 if (insist_on_sandboxing(
2690                                     context,
2691                                     root_dir, root_image,
2692                                     bind_mounts,
2693                                     n_bind_mounts)) {
2694                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2695                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2696                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2697
2698                         r = -EOPNOTSUPP;
2699                 } else {
2700                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2701                         r = 0;
2702                 }
2703         }
2704
2705         bind_mount_free_many(bind_mounts, n_bind_mounts);
2706         return r;
2707 }
2708
2709 static int apply_working_directory(
2710                 const ExecContext *context,
2711                 const ExecParameters *params,
2712                 const char *home,
2713                 int *exit_status) {
2714
2715         const char *d, *wd;
2716
2717         assert(context);
2718         assert(exit_status);
2719
2720         if (context->working_directory_home) {
2721
2722                 if (!home) {
2723                         *exit_status = EXIT_CHDIR;
2724                         return -ENXIO;
2725                 }
2726
2727                 wd = home;
2728
2729         } else if (context->working_directory)
2730                 wd = context->working_directory;
2731         else
2732                 wd = "/";
2733
2734         if (params->flags & EXEC_APPLY_CHROOT)
2735                 d = wd;
2736         else
2737                 d = prefix_roota(context->root_directory, wd);
2738
2739         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2740                 *exit_status = EXIT_CHDIR;
2741                 return -errno;
2742         }
2743
2744         return 0;
2745 }
2746
2747 static int apply_root_directory(
2748                 const ExecContext *context,
2749                 const ExecParameters *params,
2750                 const bool needs_mount_ns,
2751                 int *exit_status) {
2752
2753         assert(context);
2754         assert(exit_status);
2755
2756         if (params->flags & EXEC_APPLY_CHROOT) {
2757                 if (!needs_mount_ns && context->root_directory)
2758                         if (chroot(context->root_directory) < 0) {
2759                                 *exit_status = EXIT_CHROOT;
2760                                 return -errno;
2761                         }
2762         }
2763
2764         return 0;
2765 }
2766
2767 static int setup_keyring(
2768                 const Unit *u,
2769                 const ExecContext *context,
2770                 const ExecParameters *p,
2771                 uid_t uid, gid_t gid) {
2772
2773         key_serial_t keyring;
2774         int r = 0;
2775         uid_t saved_uid;
2776         gid_t saved_gid;
2777
2778         assert(u);
2779         assert(context);
2780         assert(p);
2781
2782         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2783          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2784          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2785          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2786          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2787          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2788
2789         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2790                 return 0;
2791
2792         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2793          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2794          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2795          * & group is just as nasty as acquiring a reference to the user keyring. */
2796
2797         saved_uid = getuid();
2798         saved_gid = getgid();
2799
2800         if (gid_is_valid(gid) && gid != saved_gid) {
2801                 if (setregid(gid, -1) < 0)
2802                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2803         }
2804
2805         if (uid_is_valid(uid) && uid != saved_uid) {
2806                 if (setreuid(uid, -1) < 0) {
2807                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2808                         goto out;
2809                 }
2810         }
2811
2812         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2813         if (keyring == -1) {
2814                 if (errno == ENOSYS)
2815                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2816                 else if (IN_SET(errno, EACCES, EPERM))
2817                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2818                 else if (errno == EDQUOT)
2819                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2820                 else
2821                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2822
2823                 goto out;
2824         }
2825
2826         /* When requested link the user keyring into the session keyring. */
2827         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2828
2829                 if (keyctl(KEYCTL_LINK,
2830                            KEY_SPEC_USER_KEYRING,
2831                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2832                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2833                         goto out;
2834                 }
2835         }
2836
2837         /* Restore uid/gid back */
2838         if (uid_is_valid(uid) && uid != saved_uid) {
2839                 if (setreuid(saved_uid, -1) < 0) {
2840                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2841                         goto out;
2842                 }
2843         }
2844
2845         if (gid_is_valid(gid) && gid != saved_gid) {
2846                 if (setregid(saved_gid, -1) < 0)
2847                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2848         }
2849
2850         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2851         if (!sd_id128_is_null(u->invocation_id)) {
2852                 key_serial_t key;
2853
2854                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2855                 if (key == -1)
2856                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2857                 else {
2858                         if (keyctl(KEYCTL_SETPERM, key,
2859                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2860                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2861                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2862                 }
2863         }
2864
2865 out:
2866         /* Revert back uid & gid for the last time, and exit */
2867         /* no extra logging, as only the first already reported error matters */
2868         if (getuid() != saved_uid)
2869                 (void) setreuid(saved_uid, -1);
2870
2871         if (getgid() != saved_gid)
2872                 (void) setregid(saved_gid, -1);
2873
2874         return r;
2875 }
2876
/* Appends the valid (i.e. non-negative) file descriptors of the two-element 'pair' to 'array', first element
 * first, advancing the element counter *n for each descriptor added. The caller must provide enough room in
 * 'array' for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2887
2888 static int close_remaining_fds(
2889                 const ExecParameters *params,
2890                 const ExecRuntime *runtime,
2891                 const DynamicCreds *dcreds,
2892                 int user_lookup_fd,
2893                 int socket_fd,
2894                 int exec_fd,
2895                 const int *fds, size_t n_fds) {
2896
2897         size_t n_dont_close = 0;
2898         int dont_close[n_fds + 12];
2899
2900         assert(params);
2901
2902         if (params->stdin_fd >= 0)
2903                 dont_close[n_dont_close++] = params->stdin_fd;
2904         if (params->stdout_fd >= 0)
2905                 dont_close[n_dont_close++] = params->stdout_fd;
2906         if (params->stderr_fd >= 0)
2907                 dont_close[n_dont_close++] = params->stderr_fd;
2908
2909         if (socket_fd >= 0)
2910                 dont_close[n_dont_close++] = socket_fd;
2911         if (exec_fd >= 0)
2912                 dont_close[n_dont_close++] = exec_fd;
2913         if (n_fds > 0) {
2914                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2915                 n_dont_close += n_fds;
2916         }
2917
2918         if (runtime)
2919                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2920
2921         if (dcreds) {
2922                 if (dcreds->user)
2923                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2924                 if (dcreds->group)
2925                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2926         }
2927
2928         if (user_lookup_fd >= 0)
2929                 dont_close[n_dont_close++] = user_lookup_fd;
2930
2931         return close_all_fds(dont_close, n_dont_close);
2932 }
2933
2934 static int send_user_lookup(
2935                 Unit *unit,
2936                 int user_lookup_fd,
2937                 uid_t uid,
2938                 gid_t gid) {
2939
2940         assert(unit);
2941
2942         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2943          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2944          * specified. */
2945
2946         if (user_lookup_fd < 0)
2947                 return 0;
2948
2949         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2950                 return 0;
2951
2952         if (writev(user_lookup_fd,
2953                (struct iovec[]) {
2954                            IOVEC_INIT(&uid, sizeof(uid)),
2955                            IOVEC_INIT(&gid, sizeof(gid)),
2956                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2957                 return -errno;
2958
2959         return 0;
2960 }
2961
2962 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2963         int r;
2964
2965         assert(c);
2966         assert(home);
2967         assert(buf);
2968
2969         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2970
2971         if (*home)
2972                 return 0;
2973
2974         if (!c->working_directory_home)
2975                 return 0;
2976
2977         r = get_home_dir(buf);
2978         if (r < 0)
2979                 return r;
2980
2981         *home = *buf;
2982         return 1;
2983 }
2984
2985 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2986         _cleanup_strv_free_ char ** list = NULL;
2987         ExecDirectoryType t;
2988         int r;
2989
2990         assert(c);
2991         assert(p);
2992         assert(ret);
2993
2994         assert(c->dynamic_user);
2995
2996         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2997          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2998          * directories. */
2999
3000         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3001                 char **i;
3002
3003                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3004                         continue;
3005
3006                 if (!p->prefix[t])
3007                         continue;
3008
3009                 STRV_FOREACH(i, c->directories[t].paths) {
3010                         char *e;
3011
3012                         if (exec_directory_is_private(c, t))
3013                                 e = path_join(p->prefix[t], "private", *i);
3014                         else
3015                                 e = path_join(p->prefix[t], *i);
3016                         if (!e)
3017                                 return -ENOMEM;
3018
3019                         r = strv_consume(&list, e);
3020                         if (r < 0)
3021                                 return r;
3022                 }
3023         }
3024
3025         *ret = TAKE_PTR(list);
3026
3027         return 0;
3028 }
3029
3030 static char *exec_command_line(char **argv);
3031
3032 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3033         bool using_subcgroup;
3034         char *p;
3035
3036         assert(params);
3037         assert(ret);
3038
3039         if (!params->cgroup_path)
3040                 return -EINVAL;
3041
3042         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3043          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3044          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3045          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3046          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3047          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3048          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3049          * flag, which is only passed for the former statements, not for the latter. */
3050
3051         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3052         if (using_subcgroup)
3053                 p = path_join(params->cgroup_path, ".control");
3054         else
3055                 p = strdup(params->cgroup_path);
3056         if (!p)
3057                 return -ENOMEM;
3058
3059         *ret = p;
3060         return using_subcgroup;
3061 }
3062
3063 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3064         _cleanup_(cpu_set_reset) CPUSet s = {};
3065         int r;
3066
3067         assert(c);
3068         assert(ret);
3069
3070         if (!c->numa_policy.nodes.set) {
3071                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3072                 return 0;
3073         }
3074
3075         r = numa_to_cpu_set(&c->numa_policy, &s);
3076         if (r < 0)
3077                 return r;
3078
3079         cpu_set_reset(ret);
3080
3081         return cpu_set_add_all(ret, &s);
3082 }
3083
3084 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3085         assert(c);
3086
3087         return c->cpu_affinity_from_numa;
3088 }
3089
3090 static int exec_child(
3091                 Unit *unit,
3092                 const ExecCommand *command,
3093                 const ExecContext *context,
3094                 const ExecParameters *params,
3095                 ExecRuntime *runtime,
3096                 DynamicCreds *dcreds,
3097                 int socket_fd,
3098                 const int named_iofds[static 3],
3099                 int *fds,
3100                 size_t n_socket_fds,
3101                 size_t n_storage_fds,
3102                 char **files_env,
3103                 int user_lookup_fd,
3104                 int *exit_status) {
3105
3106         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3107         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3108         _cleanup_free_ gid_t *supplementary_gids = NULL;
3109         const char *username = NULL, *groupname = NULL;
3110         _cleanup_free_ char *home_buffer = NULL;
3111         const char *home = NULL, *shell = NULL;
3112         char **final_argv = NULL;
3113         dev_t journal_stream_dev = 0;
3114         ino_t journal_stream_ino = 0;
3115         bool userns_set_up = false;
3116         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3117                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3118                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3119                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3120 #if HAVE_SELINUX
3121         _cleanup_free_ char *mac_selinux_context_net = NULL;
3122         bool use_selinux = false;
3123 #endif
3124 #if ENABLE_SMACK
3125         bool use_smack = false;
3126 #endif
3127 #if HAVE_APPARMOR
3128         bool use_apparmor = false;
3129 #endif
3130         uid_t saved_uid = getuid();
3131         gid_t saved_gid = getgid();
3132         uid_t uid = UID_INVALID;
3133         gid_t gid = GID_INVALID;
3134         size_t n_fds;
3135         ExecDirectoryType dt;
3136         int secure_bits;
3137         _cleanup_free_ gid_t *gids_after_pam = NULL;
3138         int ngids_after_pam = 0;
3139
3140         assert(unit);
3141         assert(command);
3142         assert(context);
3143         assert(params);
3144         assert(exit_status);
3145
3146         rename_process_from_path(command->path);
3147
3148         /* We reset exactly these signals, since they are the
3149          * only ones we set to SIG_IGN in the main daemon. All
3150          * others we leave untouched because we set them to
3151          * SIG_DFL or a valid handler initially, both of which
3152          * will be demoted to SIG_DFL. */
3153         (void) default_signals(SIGNALS_CRASH_HANDLER,
3154                                SIGNALS_IGNORE, -1);
3155
3156         if (context->ignore_sigpipe)
3157                 (void) ignore_signals(SIGPIPE, -1);
3158
3159         r = reset_signal_mask();
3160         if (r < 0) {
3161                 *exit_status = EXIT_SIGNAL_MASK;
3162                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3163         }
3164
3165         if (params->idle_pipe)
3166                 do_idle_pipe_dance(params->idle_pipe);
3167
3168         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3169          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3170          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3171          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3172
3173         log_forget_fds();
3174         log_set_open_when_needed(true);
3175
3176         /* In case anything used libc syslog(), close this here, too */
3177         closelog();
3178
3179         n_fds = n_socket_fds + n_storage_fds;
3180         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3181         if (r < 0) {
3182                 *exit_status = EXIT_FDS;
3183                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3184         }
3185
3186         if (!context->same_pgrp)
3187                 if (setsid() < 0) {
3188                         *exit_status = EXIT_SETSID;
3189                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3190                 }
3191
3192         exec_context_tty_reset(context, params);
3193
3194         if (unit_shall_confirm_spawn(unit)) {
3195                 const char *vc = params->confirm_spawn;
3196                 _cleanup_free_ char *cmdline = NULL;
3197
3198                 cmdline = exec_command_line(command->argv);
3199                 if (!cmdline) {
3200                         *exit_status = EXIT_MEMORY;
3201                         return log_oom();
3202                 }
3203
3204                 r = ask_for_confirmation(vc, unit, cmdline);
3205                 if (r != CONFIRM_EXECUTE) {
3206                         if (r == CONFIRM_PRETEND_SUCCESS) {
3207                                 *exit_status = EXIT_SUCCESS;
3208                                 return 0;
3209                         }
3210                         *exit_status = EXIT_CONFIRM;
3211                         log_unit_error(unit, "Execution cancelled by the user");
3212                         return -ECANCELED;
3213                 }
3214         }
3215
3216         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3217          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3218          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3219          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3220          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3221         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3222             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3223                 *exit_status = EXIT_MEMORY;
3224                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3225         }
3226
3227         if (context->dynamic_user && dcreds) {
3228                 _cleanup_strv_free_ char **suggested_paths = NULL;
3229
3230                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3231                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3232                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3233                         *exit_status = EXIT_USER;
3234                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3235                 }
3236
3237                 r = compile_suggested_paths(context, params, &suggested_paths);
3238                 if (r < 0) {
3239                         *exit_status = EXIT_MEMORY;
3240                         return log_oom();
3241                 }
3242
3243                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3244                 if (r < 0) {
3245                         *exit_status = EXIT_USER;
3246                         if (r == -EILSEQ) {
3247                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3248                                 return -EOPNOTSUPP;
3249                         }
3250                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3251                 }
3252
3253                 if (!uid_is_valid(uid)) {
3254                         *exit_status = EXIT_USER;
3255                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3256                         return -ESRCH;
3257                 }
3258
3259                 if (!gid_is_valid(gid)) {
3260                         *exit_status = EXIT_USER;
3261                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3262                         return -ESRCH;
3263                 }
3264
3265                 if (dcreds->user)
3266                         username = dcreds->user->name;
3267
3268         } else {
3269                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3270                 if (r < 0) {
3271                         *exit_status = EXIT_USER;
3272                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3273                 }
3274
3275                 r = get_fixed_group(context, &groupname, &gid);
3276                 if (r < 0) {
3277                         *exit_status = EXIT_GROUP;
3278                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3279                 }
3280         }
3281
3282         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3283         r = get_supplementary_groups(context, username, groupname, gid,
3284                                      &supplementary_gids, &ngids);
3285         if (r < 0) {
3286                 *exit_status = EXIT_GROUP;
3287                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3288         }
3289
3290         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3291         if (r < 0) {
3292                 *exit_status = EXIT_USER;
3293                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3294         }
3295
3296         user_lookup_fd = safe_close(user_lookup_fd);
3297
3298         r = acquire_home(context, uid, &home, &home_buffer);
3299         if (r < 0) {
3300                 *exit_status = EXIT_CHDIR;
3301                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3302         }
3303
3304         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3305          * must make sure to drop O_NONBLOCK */
3306         if (socket_fd >= 0)
3307                 (void) fd_nonblock(socket_fd, false);
3308
3309         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3310          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3311         if (params->cgroup_path) {
3312                 _cleanup_free_ char *p = NULL;
3313
3314                 r = exec_parameters_get_cgroup_path(params, &p);
3315                 if (r < 0) {
3316                         *exit_status = EXIT_CGROUP;
3317                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3318                 }
3319
3320                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3321                 if (r < 0) {
3322                         *exit_status = EXIT_CGROUP;
3323                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3324                 }
3325         }
3326
3327         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3328                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3329                 if (r < 0) {
3330                         *exit_status = EXIT_NETWORK;
3331                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3332                 }
3333         }
3334
3335         r = setup_input(context, params, socket_fd, named_iofds);
3336         if (r < 0) {
3337                 *exit_status = EXIT_STDIN;
3338                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3339         }
3340
3341         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3342         if (r < 0) {
3343                 *exit_status = EXIT_STDOUT;
3344                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3345         }
3346
3347         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3348         if (r < 0) {
3349                 *exit_status = EXIT_STDERR;
3350                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3351         }
3352
3353         if (context->oom_score_adjust_set) {
3354                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3355                  * prohibit write access to this file, and we shouldn't trip up over that. */
3356                 r = set_oom_score_adjust(context->oom_score_adjust);
3357                 if (IN_SET(r, -EPERM, -EACCES))
3358                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3359                 else if (r < 0) {
3360                         *exit_status = EXIT_OOM_ADJUST;
3361                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3362                 }
3363         }
3364
3365         if (context->coredump_filter_set) {
3366                 r = set_coredump_filter(context->coredump_filter);
3367                 if (ERRNO_IS_PRIVILEGE(r))
3368                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3369                 else if (r < 0)
3370                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3371         }
3372
3373         if (context->nice_set) {
3374                 r = setpriority_closest(context->nice);
3375                 if (r < 0)
3376                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3377         }
3378
3379         if (context->cpu_sched_set) {
3380                 struct sched_param param = {
3381                         .sched_priority = context->cpu_sched_priority,
3382                 };
3383
3384                 r = sched_setscheduler(0,
3385                                        context->cpu_sched_policy |
3386                                        (context->cpu_sched_reset_on_fork ?
3387                                         SCHED_RESET_ON_FORK : 0),
3388                                        &param);
3389                 if (r < 0) {
3390                         *exit_status = EXIT_SETSCHEDULER;
3391                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3392                 }
3393         }
3394
3395         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3396                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3397                 const CPUSet *cpu_set;
3398
3399                 if (context->cpu_affinity_from_numa) {
3400                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3401                         if (r < 0) {
3402                                 *exit_status = EXIT_CPUAFFINITY;
3403                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3404                         }
3405
3406                         cpu_set = &converted_cpu_set;
3407                 } else
3408                         cpu_set = &context->cpu_set;
3409
3410                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3411                         *exit_status = EXIT_CPUAFFINITY;
3412                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3413                 }
3414         }
3415
3416         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3417                 r = apply_numa_policy(&context->numa_policy);
3418                 if (r == -EOPNOTSUPP)
3419                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3420                 else if (r < 0) {
3421                         *exit_status = EXIT_NUMA_POLICY;
3422                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3423                 }
3424         }
3425
3426         if (context->ioprio_set)
3427                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3428                         *exit_status = EXIT_IOPRIO;
3429                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3430                 }
3431
3432         if (context->timer_slack_nsec != NSEC_INFINITY)
3433                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3434                         *exit_status = EXIT_TIMERSLACK;
3435                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3436                 }
3437
3438         if (context->personality != PERSONALITY_INVALID) {
3439                 r = safe_personality(context->personality);
3440                 if (r < 0) {
3441                         *exit_status = EXIT_PERSONALITY;
3442                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3443                 }
3444         }
3445
3446         if (context->utmp_id)
3447                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3448                                       context->tty_path,
3449                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3450                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3451                                       USER_PROCESS,
3452                                       username);
3453
3454         if (uid_is_valid(uid)) {
3455                 r = chown_terminal(STDIN_FILENO, uid);
3456                 if (r < 0) {
3457                         *exit_status = EXIT_STDIN;
3458                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3459                 }
3460         }
3461
3462         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3463          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3464          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3465          * touch a single hierarchy too. */
3466         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3467                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3468                 if (r < 0) {
3469                         *exit_status = EXIT_CGROUP;
3470                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3471                 }
3472         }
3473
3474         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3475                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3476                 if (r < 0)
3477                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3478         }
3479
3480         r = build_environment(
3481                         unit,
3482                         context,
3483                         params,
3484                         n_fds,
3485                         home,
3486                         username,
3487                         shell,
3488                         journal_stream_dev,
3489                         journal_stream_ino,
3490                         &our_env);
3491         if (r < 0) {
3492                 *exit_status = EXIT_MEMORY;
3493                 return log_oom();
3494         }
3495
3496         r = build_pass_environment(context, &pass_env);
3497         if (r < 0) {
3498                 *exit_status = EXIT_MEMORY;
3499                 return log_oom();
3500         }
3501
3502         accum_env = strv_env_merge(5,
3503                                    params->environment,
3504                                    our_env,
3505                                    pass_env,
3506                                    context->environment,
3507                                    files_env);
3508         if (!accum_env) {
3509                 *exit_status = EXIT_MEMORY;
3510                 return log_oom();
3511         }
3512         accum_env = strv_env_clean(accum_env);
3513
3514         (void) umask(context->umask);
3515
3516         r = setup_keyring(unit, context, params, uid, gid);
3517         if (r < 0) {
3518                 *exit_status = EXIT_KEYRING;
3519                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3520         }
3521
3522         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3523         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3524
3525         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3526         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3527
3528         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3529         if (needs_ambient_hack)
3530                 needs_setuid = false;
3531         else
3532                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3533
3534         if (needs_sandboxing) {
3535                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3536                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3537                  * impacting our own code paths. */
3538
3539 #if HAVE_SELINUX
3540                 use_selinux = mac_selinux_use();
3541 #endif
3542 #if ENABLE_SMACK
3543                 use_smack = mac_smack_use();
3544 #endif
3545 #if HAVE_APPARMOR
3546                 use_apparmor = mac_apparmor_use();
3547 #endif
3548         }
3549
3550         if (needs_sandboxing) {
3551                 int which_failed;
3552
3553                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3554                  * is set here. (See below.) */
3555
3556                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3557                 if (r < 0) {
3558                         *exit_status = EXIT_LIMITS;
3559                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3560                 }
3561         }
3562
3563         if (needs_setuid) {
3564
3565                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3566                  * wins here. (See above.) */
3567
3568                 if (context->pam_name && username) {
3569                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3570                         if (r < 0) {
3571                                 *exit_status = EXIT_PAM;
3572                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3573                         }
3574
3575                         ngids_after_pam = getgroups_alloc(&gids_after_pam);
3576                         if (ngids_after_pam < 0) {
3577                                 *exit_status = EXIT_MEMORY;
3578                                 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3579                         }
3580                 }
3581         }
3582
3583         if (needs_sandboxing) {
3584 #if HAVE_SELINUX
3585                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3586                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3587                         if (r < 0) {
3588                                 *exit_status = EXIT_SELINUX_CONTEXT;
3589                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3590                         }
3591                 }
3592 #endif
3593
3594                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3595                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3596                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3597                 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3598                         userns_set_up = true;
3599                         r = setup_private_users(saved_uid, saved_gid, uid, gid);
3600                         if (r < 0) {
3601                                 *exit_status = EXIT_USER;
3602                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3603                         }
3604                 }
3605         }
3606
3607         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3608
3609                 if (ns_type_supported(NAMESPACE_NET)) {
3610                         r = setup_netns(runtime->netns_storage_socket);
3611                         if (r == -EPERM)
3612                                 log_unit_warning_errno(unit, r,
3613                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3614                         else if (r < 0) {
3615                                 *exit_status = EXIT_NETWORK;
3616                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3617                         }
3618                 } else if (context->network_namespace_path) {
3619                         *exit_status = EXIT_NETWORK;
3620                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3621                                                     "NetworkNamespacePath= is not supported, refusing.");
3622                 } else
3623                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3624         }
3625
3626         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3627         if (needs_mount_namespace) {
3628                 _cleanup_free_ char *error_path = NULL;
3629
3630                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3631                 if (r < 0) {
3632                         *exit_status = EXIT_NAMESPACE;
3633                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3634                                                     error_path ? ": " : "", strempty(error_path));
3635                 }
3636         }
3637
3638         if (needs_sandboxing) {
3639                 r = apply_protect_hostname(unit, context, exit_status);
3640                 if (r < 0)
3641                         return r;
3642         }
3643
3644         /* Drop groups as early as possible.
3645          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3646          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3647         if (needs_setuid) {
3648                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3649                 int ngids_to_enforce = 0;
3650
3651                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3652                                                    ngids,
3653                                                    gids_after_pam,
3654                                                    ngids_after_pam,
3655                                                    &gids_to_enforce);
3656                 if (ngids_to_enforce < 0) {
3657                         *exit_status = EXIT_MEMORY;
3658                         return log_unit_error_errno(unit,
3659                                                     ngids_to_enforce,
3660                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
3661                 }
3662
3663                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3664                 if (r < 0) {
3665                         *exit_status = EXIT_GROUP;
3666                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3667                 }
3668         }
3669
3670         /* If the user namespace was not set up above, try to do it now.
3671          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3672          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3673          * case of mount namespaces being less privileged when the mount point list is copied from a
3674          * different user namespace). */
3675
3676         if (needs_sandboxing && context->private_users && !userns_set_up) {
3677                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3678                 if (r < 0) {
3679                         *exit_status = EXIT_USER;
3680                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3681                 }
3682         }
3683
3684         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3685          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3686          * however if we have it as we want to keep it open until the final execve(). */
3687
3688         if (params->exec_fd >= 0) {
3689                 exec_fd = params->exec_fd;
3690
3691                 if (exec_fd < 3 + (int) n_fds) {
3692                         int moved_fd;
3693
3694                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3695                          * process we are about to execute. */
3696
3697                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3698                         if (moved_fd < 0) {
3699                                 *exit_status = EXIT_FDS;
3700                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3701                         }
3702
3703                         safe_close(exec_fd);
3704                         exec_fd = moved_fd;
3705                 } else {
3706                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3707                         r = fd_cloexec(exec_fd, true);
3708                         if (r < 0) {
3709                                 *exit_status = EXIT_FDS;
3710                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3711                         }
3712                 }
3713
3714                 fds_with_exec_fd = newa(int, n_fds + 1);
3715                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3716                 fds_with_exec_fd[n_fds] = exec_fd;
3717                 n_fds_with_exec_fd = n_fds + 1;
3718         } else {
3719                 fds_with_exec_fd = fds;
3720                 n_fds_with_exec_fd = n_fds;
3721         }
3722
3723         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3724         if (r >= 0)
3725                 r = shift_fds(fds, n_fds);
3726         if (r >= 0)
3727                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3728         if (r < 0) {
3729                 *exit_status = EXIT_FDS;
3730                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3731         }
3732
3733         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3734          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3735          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3736          * came this far. */
3737
3738         secure_bits = context->secure_bits;
3739
3740         if (needs_sandboxing) {
3741                 uint64_t bset;
3742
3743                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3744                  * requested. (Note this is placed after the general resource limit initialization, see
3745                  * above, in order to take precedence.) */
3746                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3747                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3748                                 *exit_status = EXIT_LIMITS;
3749                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3750                         }
3751                 }
3752
3753 #if ENABLE_SMACK
3754                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3755                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3756                 if (use_smack) {
3757                         r = setup_smack(context, command);
3758                         if (r < 0) {
3759                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3760                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3761                         }
3762                 }
3763 #endif
3764
3765                 bset = context->capability_bounding_set;
3766                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3767                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3768                  * instead of us doing that */
3769                 if (needs_ambient_hack)
3770                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3771                                 (UINT64_C(1) << CAP_SETUID) |
3772                                 (UINT64_C(1) << CAP_SETGID);
3773
3774                 if (!cap_test_all(bset)) {
3775                         r = capability_bounding_set_drop(bset, false);
3776                         if (r < 0) {
3777                                 *exit_status = EXIT_CAPABILITIES;
3778                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3779                         }
3780                 }
3781
3782                 /* This is done before enforce_user, but ambient set
3783                  * does not survive over setresuid() if keep_caps is not set. */
3784                 if (!needs_ambient_hack) {
3785                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3786                         if (r < 0) {
3787                                 *exit_status = EXIT_CAPABILITIES;
3788                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3789                         }
3790                 }
3791         }
3792
3793         /* chroot to root directory first, before we lose the ability to chroot */
3794         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3795         if (r < 0)
3796                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3797
3798         if (needs_setuid) {
3799                 if (uid_is_valid(uid)) {
3800                         r = enforce_user(context, uid);
3801                         if (r < 0) {
3802                                 *exit_status = EXIT_USER;
3803                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3804                         }
3805
3806                         if (!needs_ambient_hack &&
3807                             context->capability_ambient_set != 0) {
3808
3809                                 /* Fix the ambient capabilities after user change. */
3810                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3811                                 if (r < 0) {
3812                                         *exit_status = EXIT_CAPABILITIES;
3813                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3814                                 }
3815
3816                                 /* If we were asked to change user and ambient capabilities
3817                                  * were requested, we had to add keep-caps to the securebits
3818                                  * so that we would maintain the inherited capability set
3819                                  * through the setresuid(). Make sure that the bit is added
3820                                  * also to the context secure_bits so that we don't try to
3821                                  * drop the bit away next. */
3822
3823                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3824                         }
3825                 }
3826         }
3827
3828         /* Apply working directory here, because the working directory might be on NFS and only the user running
3829          * this service might have the correct privilege to change to the working directory */
3830         r = apply_working_directory(context, params, home, exit_status);
3831         if (r < 0)
3832                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3833
3834         if (needs_sandboxing) {
3835                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3836                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3837                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3838                  * are restricted. */
3839
3840 #if HAVE_SELINUX
3841                 if (use_selinux) {
3842                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3843
3844                         if (exec_context) {
3845                                 r = setexeccon(exec_context);
3846                                 if (r < 0) {
3847                                         *exit_status = EXIT_SELINUX_CONTEXT;
3848                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3849                                 }
3850                         }
3851                 }
3852 #endif
3853
3854 #if HAVE_APPARMOR
3855                 if (use_apparmor && context->apparmor_profile) {
3856                         r = aa_change_onexec(context->apparmor_profile);
3857                         if (r < 0 && !context->apparmor_profile_ignore) {
3858                                 *exit_status = EXIT_APPARMOR_PROFILE;
3859                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3860                         }
3861                 }
3862 #endif
3863
3864                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3865                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3866                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3867                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3868                                 *exit_status = EXIT_SECUREBITS;
3869                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3870                         }
3871
3872                 if (context_has_no_new_privileges(context))
3873                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3874                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3875                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3876                         }
3877
3878 #if HAVE_SECCOMP
3879                 r = apply_address_families(unit, context);
3880                 if (r < 0) {
3881                         *exit_status = EXIT_ADDRESS_FAMILIES;
3882                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3883                 }
3884
3885                 r = apply_memory_deny_write_execute(unit, context);
3886                 if (r < 0) {
3887                         *exit_status = EXIT_SECCOMP;
3888                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3889                 }
3890
3891                 r = apply_restrict_realtime(unit, context);
3892                 if (r < 0) {
3893                         *exit_status = EXIT_SECCOMP;
3894                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3895                 }
3896
3897                 r = apply_restrict_suid_sgid(unit, context);
3898                 if (r < 0) {
3899                         *exit_status = EXIT_SECCOMP;
3900                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3901                 }
3902
3903                 r = apply_restrict_namespaces(unit, context);
3904                 if (r < 0) {
3905                         *exit_status = EXIT_SECCOMP;
3906                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3907                 }
3908
3909                 r = apply_protect_sysctl(unit, context);
3910                 if (r < 0) {
3911                         *exit_status = EXIT_SECCOMP;
3912                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3913                 }
3914
3915                 r = apply_protect_kernel_modules(unit, context);
3916                 if (r < 0) {
3917                         *exit_status = EXIT_SECCOMP;
3918                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3919                 }
3920
3921                 r = apply_protect_kernel_logs(unit, context);
3922                 if (r < 0) {
3923                         *exit_status = EXIT_SECCOMP;
3924                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3925                 }
3926
3927                 r = apply_protect_clock(unit, context);
3928                 if (r < 0) {
3929                         *exit_status = EXIT_SECCOMP;
3930                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3931                 }
3932
3933                 r = apply_private_devices(unit, context);
3934                 if (r < 0) {
3935                         *exit_status = EXIT_SECCOMP;
3936                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3937                 }
3938
3939                 r = apply_syscall_archs(unit, context);
3940                 if (r < 0) {
3941                         *exit_status = EXIT_SECCOMP;
3942                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3943                 }
3944
3945                 r = apply_lock_personality(unit, context);
3946                 if (r < 0) {
3947                         *exit_status = EXIT_SECCOMP;
3948                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3949                 }
3950
3951                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3952                  * by the filter as little as possible. */
3953                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3954                 if (r < 0) {
3955                         *exit_status = EXIT_SECCOMP;
3956                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3957                 }
3958 #endif
3959         }
3960
3961         if (!strv_isempty(context->unset_environment)) {
3962                 char **ee = NULL;
3963
3964                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3965                 if (!ee) {
3966                         *exit_status = EXIT_MEMORY;
3967                         return log_oom();
3968                 }
3969
3970                 strv_free_and_replace(accum_env, ee);
3971         }
3972
3973         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3974                 replaced_argv = replace_env_argv(command->argv, accum_env);
3975                 if (!replaced_argv) {
3976                         *exit_status = EXIT_MEMORY;
3977                         return log_oom();
3978                 }
3979                 final_argv = replaced_argv;
3980         } else
3981                 final_argv = command->argv;
3982
3983         if (DEBUG_LOGGING) {
3984                 _cleanup_free_ char *line;
3985
3986                 line = exec_command_line(final_argv);
3987                 if (line)
3988                         log_struct(LOG_DEBUG,
3989                                    "EXECUTABLE=%s", command->path,
3990                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3991                                    LOG_UNIT_ID(unit),
3992                                    LOG_UNIT_INVOCATION_ID(unit));
3993         }
3994
3995         if (exec_fd >= 0) {
3996                 uint8_t hot = 1;
3997
3998                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3999                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4000
4001                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4002                         *exit_status = EXIT_EXEC;
4003                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4004                 }
4005         }
4006
4007         execve(command->path, final_argv, accum_env);
4008         r = -errno;
4009
4010         if (exec_fd >= 0) {
4011                 uint8_t hot = 0;
4012
4013                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4014                  * that POLLHUP on it no longer means execve() succeeded. */
4015
4016                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4017                         *exit_status = EXIT_EXEC;
4018                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4019                 }
4020         }
4021
4022         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4023                 log_struct_errno(LOG_INFO, r,
4024                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4025                                  LOG_UNIT_ID(unit),
4026                                  LOG_UNIT_INVOCATION_ID(unit),
4027                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4028                                                   command->path),
4029                                  "EXECUTABLE=%s", command->path);
4030                 return 0;
4031         }
4032
4033         *exit_status = EXIT_EXEC;
4034         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4035 }
4036
4037 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4038 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4039
4040 int exec_spawn(Unit *unit,
4041                ExecCommand *command,
4042                const ExecContext *context,
4043                const ExecParameters *params,
4044                ExecRuntime *runtime,
4045                DynamicCreds *dcreds,
4046                pid_t *ret) {
4047
4048         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4049         _cleanup_free_ char *subcgroup_path = NULL;
4050         _cleanup_strv_free_ char **files_env = NULL;
4051         size_t n_storage_fds = 0, n_socket_fds = 0;
4052         _cleanup_free_ char *line = NULL;
4053         pid_t pid;
4054
4055         assert(unit);
4056         assert(command);
4057         assert(context);
4058         assert(ret);
4059         assert(params);
4060         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4061
4062         if (context->std_input == EXEC_INPUT_SOCKET ||
4063             context->std_output == EXEC_OUTPUT_SOCKET ||
4064             context->std_error == EXEC_OUTPUT_SOCKET) {
4065
4066                 if (params->n_socket_fds > 1) {
4067                         log_unit_error(unit, "Got more than one socket.");
4068                         return -EINVAL;
4069                 }
4070
4071                 if (params->n_socket_fds == 0) {
4072                         log_unit_error(unit, "Got no socket.");
4073                         return -EINVAL;
4074                 }
4075
4076                 socket_fd = params->fds[0];
4077         } else {
4078                 socket_fd = -1;
4079                 fds = params->fds;
4080                 n_socket_fds = params->n_socket_fds;
4081                 n_storage_fds = params->n_storage_fds;
4082         }
4083
4084         r = exec_context_named_iofds(context, params, named_iofds);
4085         if (r < 0)
4086                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4087
4088         r = exec_context_load_environment(unit, context, &files_env);
4089         if (r < 0)
4090                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4091
4092         line = exec_command_line(command->argv);
4093         if (!line)
4094                 return log_oom();
4095
4096         log_struct(LOG_DEBUG,
4097                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4098                    "EXECUTABLE=%s", command->path,
4099                    LOG_UNIT_ID(unit),
4100                    LOG_UNIT_INVOCATION_ID(unit));
4101
4102         if (params->cgroup_path) {
4103                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4104                 if (r < 0)
4105                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4106                 if (r > 0) { /* We are using a child cgroup */
4107                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4108                         if (r < 0)
4109                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4110                 }
4111         }
4112
4113         pid = fork();
4114         if (pid < 0)
4115                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4116
4117         if (pid == 0) {
4118                 int exit_status = EXIT_SUCCESS;
4119
4120                 r = exec_child(unit,
4121                                command,
4122                                context,
4123                                params,
4124                                runtime,
4125                                dcreds,
4126                                socket_fd,
4127                                named_iofds,
4128                                fds,
4129                                n_socket_fds,
4130                                n_storage_fds,
4131                                files_env,
4132                                unit->manager->user_lookup_fds[1],
4133                                &exit_status);
4134
4135                 if (r < 0) {
4136                         const char *status =
4137                                 exit_status_to_string(exit_status,
4138                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4139
4140                         log_struct_errno(LOG_ERR, r,
4141                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4142                                          LOG_UNIT_ID(unit),
4143                                          LOG_UNIT_INVOCATION_ID(unit),
4144                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4145                                                           status, command->path),
4146                                          "EXECUTABLE=%s", command->path);
4147                 }
4148
4149                 _exit(exit_status);
4150         }
4151
4152         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4153
4154         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4155          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4156          * process will be killed too). */
4157         if (subcgroup_path)
4158                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4159
4160         exec_status_start(&command->exec_status, pid);
4161
4162         *ret = pid;
4163         return 0;
4164 }
4165
4166 void exec_context_init(ExecContext *c) {
4167         ExecDirectoryType i;
4168
4169         assert(c);
4170
4171         c->umask = 0022;
4172         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4173         c->cpu_sched_policy = SCHED_OTHER;
4174         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4175         c->syslog_level_prefix = true;
4176         c->ignore_sigpipe = true;
4177         c->timer_slack_nsec = NSEC_INFINITY;
4178         c->personality = PERSONALITY_INVALID;
4179         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4180                 c->directories[i].mode = 0755;
4181         c->timeout_clean_usec = USEC_INFINITY;
4182         c->capability_bounding_set = CAP_ALL;
4183         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4184         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4185         c->log_level_max = -1;
4186         numa_policy_reset(&c->numa_policy);
4187 }
4188
4189 void exec_context_done(ExecContext *c) {
4190         ExecDirectoryType i;
4191         size_t l;
4192
4193         assert(c);
4194
4195         c->environment = strv_free(c->environment);
4196         c->environment_files = strv_free(c->environment_files);
4197         c->pass_environment = strv_free(c->pass_environment);
4198         c->unset_environment = strv_free(c->unset_environment);
4199
4200         rlimit_free_all(c->rlimit);
4201
4202         for (l = 0; l < 3; l++) {
4203                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4204                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4205         }
4206
4207         c->working_directory = mfree(c->working_directory);
4208         c->root_directory = mfree(c->root_directory);
4209         c->root_image = mfree(c->root_image);
4210         c->root_hash = mfree(c->root_hash);
4211         c->root_hash_size = 0;
4212         c->root_hash_path = mfree(c->root_hash_path);
4213         c->root_hash_sig = mfree(c->root_hash_sig);
4214         c->root_hash_sig_size = 0;
4215         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4216         c->root_verity = mfree(c->root_verity);
4217         c->tty_path = mfree(c->tty_path);
4218         c->syslog_identifier = mfree(c->syslog_identifier);
4219         c->user = mfree(c->user);
4220         c->group = mfree(c->group);
4221
4222         c->supplementary_groups = strv_free(c->supplementary_groups);
4223
4224         c->pam_name = mfree(c->pam_name);
4225
4226         c->read_only_paths = strv_free(c->read_only_paths);
4227         c->read_write_paths = strv_free(c->read_write_paths);
4228         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4229
4230         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4231         c->bind_mounts = NULL;
4232         c->n_bind_mounts = 0;
4233         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4234         c->temporary_filesystems = NULL;
4235         c->n_temporary_filesystems = 0;
4236
4237         cpu_set_reset(&c->cpu_set);
4238         numa_policy_reset(&c->numa_policy);
4239
4240         c->utmp_id = mfree(c->utmp_id);
4241         c->selinux_context = mfree(c->selinux_context);
4242         c->apparmor_profile = mfree(c->apparmor_profile);
4243         c->smack_process_label = mfree(c->smack_process_label);
4244
4245         c->syscall_filter = hashmap_free(c->syscall_filter);
4246         c->syscall_archs = set_free(c->syscall_archs);
4247         c->address_families = set_free(c->address_families);
4248
4249         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4250                 c->directories[i].paths = strv_free(c->directories[i].paths);
4251
4252         c->log_level_max = -1;
4253
4254         exec_context_free_log_extra_fields(c);
4255
4256         c->log_ratelimit_interval_usec = 0;
4257         c->log_ratelimit_burst = 0;
4258
4259         c->stdin_data = mfree(c->stdin_data);
4260         c->stdin_data_size = 0;
4261
4262         c->network_namespace_path = mfree(c->network_namespace_path);
4263
4264         c->log_namespace = mfree(c->log_namespace);
4265 }
4266
4267 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4268         char **i;
4269
4270         assert(c);
4271
4272         if (!runtime_prefix)
4273                 return 0;
4274
4275         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4276                 _cleanup_free_ char *p;
4277
4278                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4279                         p = path_join(runtime_prefix, "private", *i);
4280                 else
4281                         p = path_join(runtime_prefix, *i);
4282                 if (!p)
4283                         return -ENOMEM;
4284
4285                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4286                  * service next. */
4287                 (void) rm_rf(p, REMOVE_ROOT);
4288         }
4289
4290         return 0;
4291 }
4292
4293 static void exec_command_done(ExecCommand *c) {
4294         assert(c);
4295
4296         c->path = mfree(c->path);
4297         c->argv = strv_free(c->argv);
4298 }
4299
4300 void exec_command_done_array(ExecCommand *c, size_t n) {
4301         size_t i;
4302
4303         for (i = 0; i < n; i++)
4304                 exec_command_done(c+i);
4305 }
4306
4307 ExecCommand* exec_command_free_list(ExecCommand *c) {
4308         ExecCommand *i;
4309
4310         while ((i = c)) {
4311                 LIST_REMOVE(command, c, i);
4312                 exec_command_done(i);
4313                 free(i);
4314         }
4315
4316         return NULL;
4317 }
4318
4319 void exec_command_free_array(ExecCommand **c, size_t n) {
4320         size_t i;
4321
4322         for (i = 0; i < n; i++)
4323                 c[i] = exec_command_free_list(c[i]);
4324 }
4325
4326 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4327         size_t i;
4328
4329         for (i = 0; i < n; i++)
4330                 exec_status_reset(&c[i].exec_status);
4331 }
4332
4333 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4334         size_t i;
4335
4336         for (i = 0; i < n; i++) {
4337                 ExecCommand *z;
4338
4339                 LIST_FOREACH(command, z, c[i])
4340                         exec_status_reset(&z->exec_status);
4341         }
4342 }
4343
4344 typedef struct InvalidEnvInfo {
4345         const Unit *unit;
4346         const char *path;
4347 } InvalidEnvInfo;
4348
4349 static void invalid_env(const char *p, void *userdata) {
4350         InvalidEnvInfo *info = userdata;
4351
4352         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4353 }
4354
4355 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4356         assert(c);
4357
4358         switch (fd_index) {
4359
4360         case STDIN_FILENO:
4361                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4362                         return NULL;
4363
4364                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4365
4366         case STDOUT_FILENO:
4367                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4368                         return NULL;
4369
4370                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4371
4372         case STDERR_FILENO:
4373                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4374                         return NULL;
4375
4376                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4377
4378         default:
4379                 return NULL;
4380         }
4381 }
4382
4383 static int exec_context_named_iofds(
4384                 const ExecContext *c,
4385                 const ExecParameters *p,
4386                 int named_iofds[static 3]) {
4387
4388         size_t i, targets;
4389         const char* stdio_fdname[3];
4390         size_t n_fds;
4391
4392         assert(c);
4393         assert(p);
4394         assert(named_iofds);
4395
4396         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4397                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4398                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4399
4400         for (i = 0; i < 3; i++)
4401                 stdio_fdname[i] = exec_context_fdname(c, i);
4402
4403         n_fds = p->n_storage_fds + p->n_socket_fds;
4404
4405         for (i = 0; i < n_fds  && targets > 0; i++)
4406                 if (named_iofds[STDIN_FILENO] < 0 &&
4407                     c->std_input == EXEC_INPUT_NAMED_FD &&
4408                     stdio_fdname[STDIN_FILENO] &&
4409                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4410
4411                         named_iofds[STDIN_FILENO] = p->fds[i];
4412                         targets--;
4413
4414                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4415                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4416                            stdio_fdname[STDOUT_FILENO] &&
4417                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4418
4419                         named_iofds[STDOUT_FILENO] = p->fds[i];
4420                         targets--;
4421
4422                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4423                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4424                            stdio_fdname[STDERR_FILENO] &&
4425                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4426
4427                         named_iofds[STDERR_FILENO] = p->fds[i];
4428                         targets--;
4429                 }
4430
4431         return targets == 0 ? 0 : -ENOENT;
4432 }
4433
4434 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4435         char **i, **r = NULL;
4436
4437         assert(c);
4438         assert(l);
4439
4440         STRV_FOREACH(i, c->environment_files) {
4441                 char *fn;
4442                 int k;
4443                 unsigned n;
4444                 bool ignore = false;
4445                 char **p;
4446                 _cleanup_globfree_ glob_t pglob = {};
4447
4448                 fn = *i;
4449
4450                 if (fn[0] == '-') {
4451                         ignore = true;
4452                         fn++;
4453                 }
4454
4455                 if (!path_is_absolute(fn)) {
4456                         if (ignore)
4457                                 continue;
4458
4459                         strv_free(r);
4460                         return -EINVAL;
4461                 }
4462
4463                 /* Filename supports globbing, take all matching files */
4464                 k = safe_glob(fn, 0, &pglob);
4465                 if (k < 0) {
4466                         if (ignore)
4467                                 continue;
4468
4469                         strv_free(r);
4470                         return k;
4471                 }
4472
4473                 /* When we don't match anything, -ENOENT should be returned */
4474                 assert(pglob.gl_pathc > 0);
4475
4476                 for (n = 0; n < pglob.gl_pathc; n++) {
4477                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4478                         if (k < 0) {
4479                                 if (ignore)
4480                                         continue;
4481
4482                                 strv_free(r);
4483                                 return k;
4484                         }
4485                         /* Log invalid environment variables with filename */
4486                         if (p) {
4487                                 InvalidEnvInfo info = {
4488                                         .unit = unit,
4489                                         .path = pglob.gl_pathv[n]
4490                                 };
4491
4492                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4493                         }
4494
4495                         if (!r)
4496                                 r = p;
4497                         else {
4498                                 char **m;
4499
4500                                 m = strv_env_merge(2, r, p);
4501                                 strv_free(r);
4502                                 strv_free(p);
4503                                 if (!m)
4504                                         return -ENOMEM;
4505
4506                                 r = m;
4507                         }
4508                 }
4509         }
4510
4511         *l = r;
4512
4513         return 0;
4514 }
4515
4516 static bool tty_may_match_dev_console(const char *tty) {
4517         _cleanup_free_ char *resolved = NULL;
4518
4519         if (!tty)
4520                 return true;
4521
4522         tty = skip_dev_prefix(tty);
4523
4524         /* trivial identity? */
4525         if (streq(tty, "console"))
4526                 return true;
4527
4528         if (resolve_dev_console(&resolved) < 0)
4529                 return true; /* if we could not resolve, assume it may */
4530
4531         /* "tty0" means the active VC, so it may be the same sometimes */
4532         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4533 }
4534
4535 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4536         assert(ec);
4537
4538         return ec->tty_reset ||
4539                 ec->tty_vhangup ||
4540                 ec->tty_vt_disallocate ||
4541                 is_terminal_input(ec->std_input) ||
4542                 is_terminal_output(ec->std_output) ||
4543                 is_terminal_output(ec->std_error);
4544 }
4545
4546 bool exec_context_may_touch_console(const ExecContext *ec) {
4547
4548         return exec_context_may_touch_tty(ec) &&
4549                tty_may_match_dev_console(exec_context_tty_path(ec));
4550 }
4551
/* Writes every string of the NULL-terminated string vector 'l' to 'f', each preceded by a single
 * space. A NULL or empty vector produces no output. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        if (!l)
                return;

        for (char **item = l; *item; item++)
                fprintf(f, " %s", *item);
}
4560
4561 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4562         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4563         ExecDirectoryType dt;
4564         unsigned i;
4565         int r;
4566
4567         assert(c);
4568         assert(f);
4569
4570         prefix = strempty(prefix);
4571
4572         fprintf(f,
4573                 "%sUMask: %04o\n"
4574                 "%sWorkingDirectory: %s\n"
4575                 "%sRootDirectory: %s\n"
4576                 "%sNonBlocking: %s\n"
4577                 "%sPrivateTmp: %s\n"
4578                 "%sPrivateDevices: %s\n"
4579                 "%sProtectKernelTunables: %s\n"
4580                 "%sProtectKernelModules: %s\n"
4581                 "%sProtectKernelLogs: %s\n"
4582                 "%sProtectClock: %s\n"
4583                 "%sProtectControlGroups: %s\n"
4584                 "%sPrivateNetwork: %s\n"
4585                 "%sPrivateUsers: %s\n"
4586                 "%sProtectHome: %s\n"
4587                 "%sProtectSystem: %s\n"
4588                 "%sMountAPIVFS: %s\n"
4589                 "%sIgnoreSIGPIPE: %s\n"
4590                 "%sMemoryDenyWriteExecute: %s\n"
4591                 "%sRestrictRealtime: %s\n"
4592                 "%sRestrictSUIDSGID: %s\n"
4593                 "%sKeyringMode: %s\n"
4594                 "%sProtectHostname: %s\n",
4595                 prefix, c->umask,
4596                 prefix, c->working_directory ? c->working_directory : "/",
4597                 prefix, c->root_directory ? c->root_directory : "/",
4598                 prefix, yes_no(c->non_blocking),
4599                 prefix, yes_no(c->private_tmp),
4600                 prefix, yes_no(c->private_devices),
4601                 prefix, yes_no(c->protect_kernel_tunables),
4602                 prefix, yes_no(c->protect_kernel_modules),
4603                 prefix, yes_no(c->protect_kernel_logs),
4604                 prefix, yes_no(c->protect_clock),
4605                 prefix, yes_no(c->protect_control_groups),
4606                 prefix, yes_no(c->private_network),
4607                 prefix, yes_no(c->private_users),
4608                 prefix, protect_home_to_string(c->protect_home),
4609                 prefix, protect_system_to_string(c->protect_system),
4610                 prefix, yes_no(c->mount_apivfs),
4611                 prefix, yes_no(c->ignore_sigpipe),
4612                 prefix, yes_no(c->memory_deny_write_execute),
4613                 prefix, yes_no(c->restrict_realtime),
4614                 prefix, yes_no(c->restrict_suid_sgid),
4615                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4616                 prefix, yes_no(c->protect_hostname));
4617
4618         if (c->root_image)
4619                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4620
4621         if (c->root_hash) {
4622                 _cleanup_free_ char *encoded = NULL;
4623                 encoded = hexmem(c->root_hash, c->root_hash_size);
4624                 if (encoded)
4625                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
4626         }
4627
4628         if (c->root_hash_path)
4629                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
4630
4631         if (c->root_hash_sig) {
4632                 _cleanup_free_ char *encoded = NULL;
4633                 ssize_t len;
4634                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
4635                 if (len)
4636                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
4637         }
4638
4639         if (c->root_hash_sig_path)
4640                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
4641
4642         if (c->root_verity)
4643                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
4644
4645         STRV_FOREACH(e, c->environment)
4646                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4647
4648         STRV_FOREACH(e, c->environment_files)
4649                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4650
4651         STRV_FOREACH(e, c->pass_environment)
4652                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4653
4654         STRV_FOREACH(e, c->unset_environment)
4655                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4656
4657         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4658
4659         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4660                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4661
4662                 STRV_FOREACH(d, c->directories[dt].paths)
4663                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4664         }
4665
4666         fprintf(f,
4667                 "%sTimeoutCleanSec: %s\n",
4668                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4669
4670         if (c->nice_set)
4671                 fprintf(f,
4672                         "%sNice: %i\n",
4673                         prefix, c->nice);
4674
4675         if (c->oom_score_adjust_set)
4676                 fprintf(f,
4677                         "%sOOMScoreAdjust: %i\n",
4678                         prefix, c->oom_score_adjust);
4679
4680         if (c->coredump_filter_set)
4681                 fprintf(f,
4682                         "%sCoredumpFilter: 0x%"PRIx64"\n",
4683                         prefix, c->coredump_filter);
4684
4685         for (i = 0; i < RLIM_NLIMITS; i++)
4686                 if (c->rlimit[i]) {
4687                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4688                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4689                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4690                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4691                 }
4692
4693         if (c->ioprio_set) {
4694                 _cleanup_free_ char *class_str = NULL;
4695
4696                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4697                 if (r >= 0)
4698                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4699
4700                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4701         }
4702
4703         if (c->cpu_sched_set) {
4704                 _cleanup_free_ char *policy_str = NULL;
4705
4706                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4707                 if (r >= 0)
4708                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4709
4710                 fprintf(f,
4711                         "%sCPUSchedulingPriority: %i\n"
4712                         "%sCPUSchedulingResetOnFork: %s\n",
4713                         prefix, c->cpu_sched_priority,
4714                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4715         }
4716
4717         if (c->cpu_set.set) {
4718                 _cleanup_free_ char *affinity = NULL;
4719
4720                 affinity = cpu_set_to_range_string(&c->cpu_set);
4721                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4722         }
4723
4724         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4725                 _cleanup_free_ char *nodes = NULL;
4726
4727                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4728                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4729                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4730         }
4731
4732         if (c->timer_slack_nsec != NSEC_INFINITY)
4733                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4734
4735         fprintf(f,
4736                 "%sStandardInput: %s\n"
4737                 "%sStandardOutput: %s\n"
4738                 "%sStandardError: %s\n",
4739                 prefix, exec_input_to_string(c->std_input),
4740                 prefix, exec_output_to_string(c->std_output),
4741                 prefix, exec_output_to_string(c->std_error));
4742
4743         if (c->std_input == EXEC_INPUT_NAMED_FD)
4744                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4745         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4746                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4747         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4748                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4749
4750         if (c->std_input == EXEC_INPUT_FILE)
4751                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4752         if (c->std_output == EXEC_OUTPUT_FILE)
4753                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4754         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4755                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4756         if (c->std_error == EXEC_OUTPUT_FILE)
4757                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4758         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4759                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4760
4761         if (c->tty_path)
4762                 fprintf(f,
4763                         "%sTTYPath: %s\n"
4764                         "%sTTYReset: %s\n"
4765                         "%sTTYVHangup: %s\n"
4766                         "%sTTYVTDisallocate: %s\n",
4767                         prefix, c->tty_path,
4768                         prefix, yes_no(c->tty_reset),
4769                         prefix, yes_no(c->tty_vhangup),
4770                         prefix, yes_no(c->tty_vt_disallocate));
4771
4772         if (IN_SET(c->std_output,
4773                    EXEC_OUTPUT_KMSG,
4774                    EXEC_OUTPUT_JOURNAL,
4775                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4776                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4777             IN_SET(c->std_error,
4778                    EXEC_OUTPUT_KMSG,
4779                    EXEC_OUTPUT_JOURNAL,
4780                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4781                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4782
4783                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4784
4785                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4786                 if (r >= 0)
4787                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4788
4789                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4790                 if (r >= 0)
4791                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4792         }
4793
4794         if (c->log_level_max >= 0) {
4795                 _cleanup_free_ char *t = NULL;
4796
4797                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4798
4799                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4800         }
4801
4802         if (c->log_ratelimit_interval_usec > 0) {
4803                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4804
4805                 fprintf(f,
4806                         "%sLogRateLimitIntervalSec: %s\n",
4807                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4808         }
4809
4810         if (c->log_ratelimit_burst > 0)
4811                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4812
4813         if (c->n_log_extra_fields > 0) {
4814                 size_t j;
4815
4816                 for (j = 0; j < c->n_log_extra_fields; j++) {
4817                         fprintf(f, "%sLogExtraFields: ", prefix);
4818                         fwrite(c->log_extra_fields[j].iov_base,
4819                                1, c->log_extra_fields[j].iov_len,
4820                                f);
4821                         fputc('\n', f);
4822                 }
4823         }
4824
4825         if (c->log_namespace)
4826                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4827
4828         if (c->secure_bits) {
4829                 _cleanup_free_ char *str = NULL;
4830
4831                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4832                 if (r >= 0)
4833                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4834         }
4835
4836         if (c->capability_bounding_set != CAP_ALL) {
4837                 _cleanup_free_ char *str = NULL;
4838
4839                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4840                 if (r >= 0)
4841                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4842         }
4843
4844         if (c->capability_ambient_set != 0) {
4845                 _cleanup_free_ char *str = NULL;
4846
4847                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4848                 if (r >= 0)
4849                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4850         }
4851
4852         if (c->user)
4853                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4854         if (c->group)
4855                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4856
4857         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4858
4859         if (!strv_isempty(c->supplementary_groups)) {
4860                 fprintf(f, "%sSupplementaryGroups:", prefix);
4861                 strv_fprintf(f, c->supplementary_groups);
4862                 fputs("\n", f);
4863         }
4864
4865         if (c->pam_name)
4866                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4867
4868         if (!strv_isempty(c->read_write_paths)) {
4869                 fprintf(f, "%sReadWritePaths:", prefix);
4870                 strv_fprintf(f, c->read_write_paths);
4871                 fputs("\n", f);
4872         }
4873
4874         if (!strv_isempty(c->read_only_paths)) {
4875                 fprintf(f, "%sReadOnlyPaths:", prefix);
4876                 strv_fprintf(f, c->read_only_paths);
4877                 fputs("\n", f);
4878         }
4879
4880         if (!strv_isempty(c->inaccessible_paths)) {
4881                 fprintf(f, "%sInaccessiblePaths:", prefix);
4882                 strv_fprintf(f, c->inaccessible_paths);
4883                 fputs("\n", f);
4884         }
4885
4886         if (c->n_bind_mounts > 0)
4887                 for (i = 0; i < c->n_bind_mounts; i++)
4888                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4889                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4890                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4891                                 c->bind_mounts[i].source,
4892                                 c->bind_mounts[i].destination,
4893                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4894
4895         if (c->n_temporary_filesystems > 0)
4896                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4897                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4898
4899                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4900                                 t->path,
4901                                 isempty(t->options) ? "" : ":",
4902                                 strempty(t->options));
4903                 }
4904
4905         if (c->utmp_id)
4906                 fprintf(f,
4907                         "%sUtmpIdentifier: %s\n",
4908                         prefix, c->utmp_id);
4909
4910         if (c->selinux_context)
4911                 fprintf(f,
4912                         "%sSELinuxContext: %s%s\n",
4913                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4914
4915         if (c->apparmor_profile)
4916                 fprintf(f,
4917                         "%sAppArmorProfile: %s%s\n",
4918                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4919
4920         if (c->smack_process_label)
4921                 fprintf(f,
4922                         "%sSmackProcessLabel: %s%s\n",
4923                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4924
4925         if (c->personality != PERSONALITY_INVALID)
4926                 fprintf(f,
4927                         "%sPersonality: %s\n",
4928                         prefix, strna(personality_to_string(c->personality)));
4929
4930         fprintf(f,
4931                 "%sLockPersonality: %s\n",
4932                 prefix, yes_no(c->lock_personality));
4933
4934         if (c->syscall_filter) {
4935 #if HAVE_SECCOMP
4936                 Iterator j;
4937                 void *id, *val;
4938                 bool first = true;
4939 #endif
4940
4941                 fprintf(f,
4942                         "%sSystemCallFilter: ",
4943                         prefix);
4944
4945                 if (!c->syscall_allow_list)
4946                         fputc('~', f);
4947
4948 #if HAVE_SECCOMP
4949                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4950                         _cleanup_free_ char *name = NULL;
4951                         const char *errno_name = NULL;
4952                         int num = PTR_TO_INT(val);
4953
4954                         if (first)
4955                                 first = false;
4956                         else
4957                                 fputc(' ', f);
4958
4959                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4960                         fputs(strna(name), f);
4961
4962                         if (num >= 0) {
4963                                 errno_name = errno_to_name(num);
4964                                 if (errno_name)
4965                                         fprintf(f, ":%s", errno_name);
4966                                 else
4967                                         fprintf(f, ":%d", num);
4968                         }
4969                 }
4970 #endif
4971
4972                 fputc('\n', f);
4973         }
4974
4975         if (c->syscall_archs) {
4976 #if HAVE_SECCOMP
4977                 Iterator j;
4978                 void *id;
4979 #endif
4980
4981                 fprintf(f,
4982                         "%sSystemCallArchitectures:",
4983                         prefix);
4984
4985 #if HAVE_SECCOMP
4986                 SET_FOREACH(id, c->syscall_archs, j)
4987                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4988 #endif
4989                 fputc('\n', f);
4990         }
4991
4992         if (exec_context_restrict_namespaces_set(c)) {
4993                 _cleanup_free_ char *s = NULL;
4994
4995                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4996                 if (r >= 0)
4997                         fprintf(f, "%sRestrictNamespaces: %s\n",
4998                                 prefix, strna(s));
4999         }
5000
5001         if (c->network_namespace_path)
5002                 fprintf(f,
5003                         "%sNetworkNamespacePath: %s\n",
5004                         prefix, c->network_namespace_path);
5005
5006         if (c->syscall_errno > 0) {
5007                 const char *errno_name;
5008
5009                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5010
5011                 errno_name = errno_to_name(c->syscall_errno);
5012                 if (errno_name)
5013                         fprintf(f, "%s\n", errno_name);
5014                 else
5015                         fprintf(f, "%d\n", c->syscall_errno);
5016         }
5017 }
5018
5019 bool exec_context_maintains_privileges(const ExecContext *c) {
5020         assert(c);
5021
5022         /* Returns true if the process forked off would run under
5023          * an unchanged UID or as root. */
5024
5025         if (!c->user)
5026                 return true;
5027
5028         if (streq(c->user, "root") || streq(c->user, "0"))
5029                 return true;
5030
5031         return false;
5032 }
5033
5034 int exec_context_get_effective_ioprio(const ExecContext *c) {
5035         int p;
5036
5037         assert(c);
5038
5039         if (c->ioprio_set)
5040                 return c->ioprio;
5041
5042         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5043         if (p < 0)
5044                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5045
5046         return p;
5047 }
5048
5049 void exec_context_free_log_extra_fields(ExecContext *c) {
5050         size_t l;
5051
5052         assert(c);
5053
5054         for (l = 0; l < c->n_log_extra_fields; l++)
5055                 free(c->log_extra_fields[l].iov_base);
5056         c->log_extra_fields = mfree(c->log_extra_fields);
5057         c->n_log_extra_fields = 0;
5058 }
5059
5060 void exec_context_revert_tty(ExecContext *c) {
5061         int r;
5062
5063         assert(c);
5064
5065         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5066         exec_context_tty_reset(c, NULL);
5067
5068         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5069          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5070          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5071
5072         if (exec_context_may_touch_tty(c)) {
5073                 const char *path;
5074
5075                 path = exec_context_tty_path(c);
5076                 if (path) {
5077                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5078                         if (r < 0 && r != -ENOENT)
5079                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5080                 }
5081         }
5082 }
5083
5084 int exec_context_get_clean_directories(
5085                 ExecContext *c,
5086                 char **prefix,
5087                 ExecCleanMask mask,
5088                 char ***ret) {
5089
5090         _cleanup_strv_free_ char **l = NULL;
5091         ExecDirectoryType t;
5092         int r;
5093
5094         assert(c);
5095         assert(prefix);
5096         assert(ret);
5097
5098         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5099                 char **i;
5100
5101                 if (!FLAGS_SET(mask, 1U << t))
5102                         continue;
5103
5104                 if (!prefix[t])
5105                         continue;
5106
5107                 STRV_FOREACH(i, c->directories[t].paths) {
5108                         char *j;
5109
5110                         j = path_join(prefix[t], *i);
5111                         if (!j)
5112                                 return -ENOMEM;
5113
5114                         r = strv_consume(&l, j);
5115                         if (r < 0)
5116                                 return r;
5117
5118                         /* Also remove private directories unconditionally. */
5119                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5120                                 j = path_join(prefix[t], "private", *i);
5121                                 if (!j)
5122                                         return -ENOMEM;
5123
5124                                 r = strv_consume(&l, j);
5125                                 if (r < 0)
5126                                         return r;
5127                         }
5128                 }
5129         }
5130
5131         *ret = TAKE_PTR(l);
5132         return 0;
5133 }
5134
5135 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5136         ExecCleanMask mask = 0;
5137
5138         assert(c);
5139         assert(ret);
5140
5141         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5142                 if (!strv_isempty(c->directories[t].paths))
5143                         mask |= 1U << t;
5144
5145         *ret = mask;
5146         return 0;
5147 }
5148
5149 void exec_status_start(ExecStatus *s, pid_t pid) {
5150         assert(s);
5151
5152         *s = (ExecStatus) {
5153                 .pid = pid,
5154         };
5155
5156         dual_timestamp_get(&s->start_timestamp);
5157 }
5158
5159 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5160         assert(s);
5161
5162         if (s->pid != pid) {
5163                 *s = (ExecStatus) {
5164                         .pid = pid,
5165                 };
5166         }
5167
5168         dual_timestamp_get(&s->exit_timestamp);
5169
5170         s->code = code;
5171         s->status = status;
5172
5173         if (context && context->utmp_id)
5174                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5175 }
5176
5177 void exec_status_reset(ExecStatus *s) {
5178         assert(s);
5179
5180         *s = (ExecStatus) {};
5181 }
5182
5183 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5184         char buf[FORMAT_TIMESTAMP_MAX];
5185
5186         assert(s);
5187         assert(f);
5188
5189         if (s->pid <= 0)
5190                 return;
5191
5192         prefix = strempty(prefix);
5193
5194         fprintf(f,
5195                 "%sPID: "PID_FMT"\n",
5196                 prefix, s->pid);
5197
5198         if (dual_timestamp_is_set(&s->start_timestamp))
5199                 fprintf(f,
5200                         "%sStart Timestamp: %s\n",
5201                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5202
5203         if (dual_timestamp_is_set(&s->exit_timestamp))
5204                 fprintf(f,
5205                         "%sExit Timestamp: %s\n"
5206                         "%sExit Code: %s\n"
5207                         "%sExit Status: %i\n",
5208                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5209                         prefix, sigchld_code_to_string(s->code),
5210                         prefix, s->status);
5211 }
5212
5213 static char *exec_command_line(char **argv) {
5214         size_t k;
5215         char *n, *p, **a;
5216         bool first = true;
5217
5218         assert(argv);
5219
5220         k = 1;
5221         STRV_FOREACH(a, argv)
5222                 k += strlen(*a)+3;
5223
5224         n = new(char, k);
5225         if (!n)
5226                 return NULL;
5227
5228         p = n;
5229         STRV_FOREACH(a, argv) {
5230
5231                 if (!first)
5232                         *(p++) = ' ';
5233                 else
5234                         first = false;
5235
5236                 if (strpbrk(*a, WHITESPACE)) {
5237                         *(p++) = '\'';
5238                         p = stpcpy(p, *a);
5239                         *(p++) = '\'';
5240                 } else
5241                         p = stpcpy(p, *a);
5242
5243         }
5244
5245         *p = 0;
5246
5247         /* FIXME: this doesn't really handle arguments that have
5248          * spaces and ticks in them */
5249
5250         return n;
5251 }
5252
5253 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5254         _cleanup_free_ char *cmd = NULL;
5255         const char *prefix2;
5256
5257         assert(c);
5258         assert(f);
5259
5260         prefix = strempty(prefix);
5261         prefix2 = strjoina(prefix, "\t");
5262
5263         cmd = exec_command_line(c->argv);
5264         fprintf(f,
5265                 "%sCommand Line: %s\n",
5266                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5267
5268         exec_status_dump(&c->exec_status, f, prefix2);
5269 }
5270
5271 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5272         assert(f);
5273
5274         prefix = strempty(prefix);
5275
5276         LIST_FOREACH(command, c, c)
5277                 exec_command_dump(c, f, prefix);
5278 }
5279
5280 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5281         ExecCommand *end;
5282
5283         assert(l);
5284         assert(e);
5285
5286         if (*l) {
5287                 /* It's kind of important, that we keep the order here */
5288                 LIST_FIND_TAIL(command, *l, end);
5289                 LIST_INSERT_AFTER(command, *l, end, e);
5290         } else
5291               *l = e;
5292 }
5293
5294 int exec_command_set(ExecCommand *c, const char *path, ...) {
5295         va_list ap;
5296         char **l, *p;
5297
5298         assert(c);
5299         assert(path);
5300
5301         va_start(ap, path);
5302         l = strv_new_ap(path, ap);
5303         va_end(ap);
5304
5305         if (!l)
5306                 return -ENOMEM;
5307
5308         p = strdup(path);
5309         if (!p) {
5310                 strv_free(l);
5311                 return -ENOMEM;
5312         }
5313
5314         free_and_replace(c->path, p);
5315
5316         return strv_free_and_replace(c->argv, l);
5317 }
5318
5319 int exec_command_append(ExecCommand *c, const char *path, ...) {
5320         _cleanup_strv_free_ char **l = NULL;
5321         va_list ap;
5322         int r;
5323
5324         assert(c);
5325         assert(path);
5326
5327         va_start(ap, path);
5328         l = strv_new_ap(path, ap);
5329         va_end(ap);
5330
5331         if (!l)
5332                 return -ENOMEM;
5333
5334         r = strv_extend_strv(&c->argv, l, false);
5335         if (r < 0)
5336                 return r;
5337
5338         return 0;
5339 }
5340
5341 static void *remove_tmpdir_thread(void *p) {
5342         _cleanup_free_ char *path = p;
5343
5344         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5345         return NULL;
5346 }
5347
5348 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5349         int r;
5350
5351         if (!rt)
5352                 return NULL;
5353
5354         if (rt->manager)
5355                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5356
5357         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5358
5359         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
5360                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5361
5362                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5363                 if (r < 0)
5364                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5365                 else
5366                         rt->tmp_dir = NULL;
5367         }
5368
5369         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
5370                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5371
5372                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5373                 if (r < 0)
5374                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5375                 else
5376                         rt->var_tmp_dir = NULL;
5377         }
5378
5379         rt->id = mfree(rt->id);
5380         rt->tmp_dir = mfree(rt->tmp_dir);
5381         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5382         safe_close_pair(rt->netns_storage_socket);
5383         return mfree(rt);
5384 }
5385
5386 static void exec_runtime_freep(ExecRuntime **rt) {
5387         (void) exec_runtime_free(*rt, false);
5388 }
5389
5390 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
5391         _cleanup_free_ char *id_copy = NULL;
5392         ExecRuntime *n;
5393
5394         assert(ret);
5395
5396         id_copy = strdup(id);
5397         if (!id_copy)
5398                 return -ENOMEM;
5399
5400         n = new(ExecRuntime, 1);
5401         if (!n)
5402                 return -ENOMEM;
5403
5404         *n = (ExecRuntime) {
5405                 .id = TAKE_PTR(id_copy),
5406                 .netns_storage_socket = { -1, -1 },
5407         };
5408
5409         *ret = n;
5410         return 0;
5411 }
5412
5413 static int exec_runtime_add(
5414                 Manager *m,
5415                 const char *id,
5416                 char **tmp_dir,
5417                 char **var_tmp_dir,
5418                 int netns_storage_socket[2],
5419                 ExecRuntime **ret) {
5420
5421         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5422         int r;
5423
5424         assert(m);
5425         assert(id);
5426
5427         /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */
5428
5429         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5430         if (r < 0)
5431                 return r;
5432
5433         r = exec_runtime_allocate(&rt, id);
5434         if (r < 0)
5435                 return r;
5436
5437         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5438         if (r < 0)
5439                 return r;
5440
5441         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
5442         rt->tmp_dir = TAKE_PTR(*tmp_dir);
5443         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
5444
5445         if (netns_storage_socket) {
5446                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
5447                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
5448         }
5449
5450         rt->manager = m;
5451
5452         if (ret)
5453                 *ret = rt;
5454         /* do not remove created ExecRuntime object when the operation succeeds. */
5455         TAKE_PTR(rt);
5456         return 0;
5457 }
5458
5459 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5460         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
5461         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5462         int r;
5463
5464         assert(m);
5465         assert(c);
5466         assert(id);
5467
5468         /* It is not necessary to create ExecRuntime object. */
5469         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5470                 return 0;
5471
5472         if (c->private_tmp &&
5473             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5474               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5475                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5476                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5477                 if (r < 0)
5478                         return r;
5479         }
5480
5481         if (c->private_network || c->network_namespace_path) {
5482                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5483                         return -errno;
5484         }
5485
5486         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret);
5487         if (r < 0)
5488                 return r;
5489
5490         return 1;
5491 }
5492
5493 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5494         ExecRuntime *rt;
5495         int r;
5496
5497         assert(m);
5498         assert(id);
5499         assert(ret);
5500
5501         rt = hashmap_get(m->exec_runtime_by_id, id);
5502         if (rt)
5503                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5504                 goto ref;
5505
5506         if (!create)
5507                 return 0;
5508
5509         /* If not found, then create a new object. */
5510         r = exec_runtime_make(m, c, id, &rt);
5511         if (r <= 0)
5512                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5513                 return r;
5514
5515 ref:
5516         /* increment reference counter. */
5517         rt->n_ref++;
5518         *ret = rt;
5519         return 1;
5520 }
5521
5522 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5523         if (!rt)
5524                 return NULL;
5525
5526         assert(rt->n_ref > 0);
5527
5528         rt->n_ref--;
5529         if (rt->n_ref > 0)
5530                 return NULL;
5531
5532         return exec_runtime_free(rt, destroy);
5533 }
5534
5535 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5536         ExecRuntime *rt;
5537         Iterator i;
5538
5539         assert(m);
5540         assert(f);
5541         assert(fds);
5542
5543         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5544                 fprintf(f, "exec-runtime=%s", rt->id);
5545
5546                 if (rt->tmp_dir)
5547                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5548
5549                 if (rt->var_tmp_dir)
5550                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5551
5552                 if (rt->netns_storage_socket[0] >= 0) {
5553                         int copy;
5554
5555                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5556                         if (copy < 0)
5557                                 return copy;
5558
5559                         fprintf(f, " netns-socket-0=%i", copy);
5560                 }
5561
5562                 if (rt->netns_storage_socket[1] >= 0) {
5563                         int copy;
5564
5565                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5566                         if (copy < 0)
5567                                 return copy;
5568
5569                         fprintf(f, " netns-socket-1=%i", copy);
5570                 }
5571
5572                 fputc('\n', f);
5573         }
5574
5575         return 0;
5576 }
5577
5578 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5579         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5580         ExecRuntime *rt;
5581         int r;
5582
5583         /* This is for the migration from old (v237 or earlier) deserialization text.
5584          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5585          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5586          * so or not from the serialized text, then we always creates a new object owned by this. */
5587
5588         assert(u);
5589         assert(key);
5590         assert(value);
5591
5592         /* Manager manages ExecRuntime objects by the unit id.
5593          * So, we omit the serialized text when the unit does not have id (yet?)... */
5594         if (isempty(u->id)) {
5595                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5596                 return 0;
5597         }
5598
5599         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5600         if (r < 0) {
5601                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5602                 return 0;
5603         }
5604
5605         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5606         if (!rt) {
5607                 r = exec_runtime_allocate(&rt_create, u->id);
5608                 if (r < 0)
5609                         return log_oom();
5610
5611                 rt = rt_create;
5612         }
5613
5614         if (streq(key, "tmp-dir")) {
5615                 char *copy;
5616
5617                 copy = strdup(value);
5618                 if (!copy)
5619                         return log_oom();
5620
5621                 free_and_replace(rt->tmp_dir, copy);
5622
5623         } else if (streq(key, "var-tmp-dir")) {
5624                 char *copy;
5625
5626                 copy = strdup(value);
5627                 if (!copy)
5628                         return log_oom();
5629
5630                 free_and_replace(rt->var_tmp_dir, copy);
5631
5632         } else if (streq(key, "netns-socket-0")) {
5633                 int fd;
5634
5635                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5636                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5637                         return 0;
5638                 }
5639
5640                 safe_close(rt->netns_storage_socket[0]);
5641                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5642
5643         } else if (streq(key, "netns-socket-1")) {
5644                 int fd;
5645
5646                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5647                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5648                         return 0;
5649                 }
5650
5651                 safe_close(rt->netns_storage_socket[1]);
5652                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5653         } else
5654                 return 0;
5655
5656         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5657         if (rt_create) {
5658                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5659                 if (r < 0) {
5660                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5661                         return 0;
5662                 }
5663
5664                 rt_create->manager = u->manager;
5665
5666                 /* Avoid cleanup */
5667                 TAKE_PTR(rt_create);
5668         }
5669
5670         return 1;
5671 }
5672
5673 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5674         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5675         char *id = NULL;
5676         int r, fdpair[] = {-1, -1};
5677         const char *p, *v = value;
5678         size_t n;
5679
5680         assert(m);
5681         assert(value);
5682         assert(fds);
5683
5684         n = strcspn(v, " ");
5685         id = strndupa(v, n);
5686         if (v[n] != ' ')
5687                 goto finalize;
5688         p = v + n + 1;
5689
5690         v = startswith(p, "tmp-dir=");
5691         if (v) {
5692                 n = strcspn(v, " ");
5693                 tmp_dir = strndup(v, n);
5694                 if (!tmp_dir)
5695                         return log_oom();
5696                 if (v[n] != ' ')
5697                         goto finalize;
5698                 p = v + n + 1;
5699         }
5700
5701         v = startswith(p, "var-tmp-dir=");
5702         if (v) {
5703                 n = strcspn(v, " ");
5704                 var_tmp_dir = strndup(v, n);
5705                 if (!var_tmp_dir)
5706                         return log_oom();
5707                 if (v[n] != ' ')
5708                         goto finalize;
5709                 p = v + n + 1;
5710         }
5711
5712         v = startswith(p, "netns-socket-0=");
5713         if (v) {
5714                 char *buf;
5715
5716                 n = strcspn(v, " ");
5717                 buf = strndupa(v, n);
5718                 if (safe_atoi(buf, &fdpair[0]) < 0 || !fdset_contains(fds, fdpair[0]))
5719                         return log_debug("Unable to process exec-runtime netns fd specification.");
5720                 fdpair[0] = fdset_remove(fds, fdpair[0]);
5721                 if (v[n] != ' ')
5722                         goto finalize;
5723                 p = v + n + 1;
5724         }
5725
5726         v = startswith(p, "netns-socket-1=");
5727         if (v) {
5728                 char *buf;
5729
5730                 n = strcspn(v, " ");
5731                 buf = strndupa(v, n);
5732                 if (safe_atoi(buf, &fdpair[1]) < 0 || !fdset_contains(fds, fdpair[1]))
5733                         return log_debug("Unable to process exec-runtime netns fd specification.");
5734                 fdpair[1] = fdset_remove(fds, fdpair[1]);
5735         }
5736
5737 finalize:
5738         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL);
5739         if (r < 0)
5740                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
5741         return 0;
5742 }
5743
5744 void exec_runtime_vacuum(Manager *m) {
5745         ExecRuntime *rt;
5746         Iterator i;
5747
5748         assert(m);
5749
5750         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5751
5752         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5753                 if (rt->n_ref > 0)
5754                         continue;
5755
5756                 (void) exec_runtime_free(rt, false);
5757         }
5758 }
5759
5760 void exec_params_clear(ExecParameters *p) {
5761         if (!p)
5762                 return;
5763
5764         strv_free(p->environment);
5765 }
5766
5767 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5768         [EXEC_INPUT_NULL] = "null",
5769         [EXEC_INPUT_TTY] = "tty",
5770         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5771         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5772         [EXEC_INPUT_SOCKET] = "socket",
5773         [EXEC_INPUT_NAMED_FD] = "fd",
5774         [EXEC_INPUT_DATA] = "data",
5775         [EXEC_INPUT_FILE] = "file",
5776 };
5777
5778 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5779
5780 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5781         [EXEC_OUTPUT_INHERIT] = "inherit",
5782         [EXEC_OUTPUT_NULL] = "null",
5783         [EXEC_OUTPUT_TTY] = "tty",
5784         [EXEC_OUTPUT_KMSG] = "kmsg",
5785         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5786         [EXEC_OUTPUT_JOURNAL] = "journal",
5787         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5788         [EXEC_OUTPUT_SOCKET] = "socket",
5789         [EXEC_OUTPUT_NAMED_FD] = "fd",
5790         [EXEC_OUTPUT_FILE] = "file",
5791         [EXEC_OUTPUT_FILE_APPEND] = "append",
5792 };
5793
5794 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5795
5796 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5797         [EXEC_UTMP_INIT] = "init",
5798         [EXEC_UTMP_LOGIN] = "login",
5799         [EXEC_UTMP_USER] = "user",
5800 };
5801
5802 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5803
5804 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5805         [EXEC_PRESERVE_NO] = "no",
5806         [EXEC_PRESERVE_YES] = "yes",
5807         [EXEC_PRESERVE_RESTART] = "restart",
5808 };
5809
5810 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5811
5812 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5813 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5814         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5815         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5816         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5817         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5818         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5819 };
5820
5821 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5822
5823 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5824  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5825  * directories, specifically .timer units with their timestamp touch file. */
5826 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5827         [EXEC_DIRECTORY_RUNTIME] = "runtime",
5828         [EXEC_DIRECTORY_STATE] = "state",
5829         [EXEC_DIRECTORY_CACHE] = "cache",
5830         [EXEC_DIRECTORY_LOGS] = "logs",
5831         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5832 };
5833
5834 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5835
5836 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5837  * the service payload in. */
5838 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5839         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5840         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5841         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5842         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5843         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5844 };
5845
5846 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5847
5848 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5849         [EXEC_KEYRING_INHERIT] = "inherit",
5850         [EXEC_KEYRING_PRIVATE] = "private",
5851         [EXEC_KEYRING_SHARED] = "shared",
5852 };
5853
5854 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);