src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/personality.h>
  10 #include <sys/prctl.h>
  11 #include <sys/shm.h>
  12 #include <sys/types.h>
  13 #include <sys/un.h>
  14 #include <unistd.h>
  15 #include <utmpx.h>
  16
  17 #if HAVE_PAM
  18 #include <security/pam_appl.h>
  19 #endif
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #if HAVE_SECCOMP
  26 #include <seccomp.h>
  27 #endif
  28
  29 #if HAVE_APPARMOR
  30 #include <sys/apparmor.h>
  31 #endif
  32
  33 #include "sd-messages.h"
  34
  35 #include "af-list.h"
  36 #include "alloc-util.h"
  37 #if HAVE_APPARMOR
  38 #include "apparmor-util.h"
  39 #endif
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "cap-list.h"
  43 #include "capability-util.h"
  44 #include "chown-recursive.h"
  45 #include "cgroup-setup.h"
  46 #include "cpu-set-util.h"
  47 #include "def.h"
  48 #include "env-file.h"
  49 #include "env-util.h"
  50 #include "errno-list.h"
  51 #include "execute.h"
  52 #include "exit-status.h"
  53 #include "fd-util.h"
  54 #include "format-util.h"
  55 #include "fs-util.h"
  56 #include "glob-util.h"
  57 #include "hexdecoct.h"
  58 #include "io-util.h"
  59 #include "ioprio.h"
  60 #include "label.h"
  61 #include "log.h"
  62 #include "macro.h"
  63 #include "manager.h"
  64 #include "memory-util.h"
  65 #include "missing_fs.h"
  66 #include "mkdir.h"
  67 #include "namespace.h"
  68 #include "parse-util.h"
  69 #include "path-util.h"
  70 #include "process-util.h"
  71 #include "rlimit-util.h"
  72 #include "rm-rf.h"
  73 #if HAVE_SECCOMP
  74 #include "seccomp-util.h"
  75 #endif
  76 #include "securebits-util.h"
  77 #include "selinux-util.h"
  78 #include "signal-util.h"
  79 #include "smack-util.h"
  80 #include "socket-util.h"
  81 #include "special.h"
  82 #include "stat-util.h"
  83 #include "string-table.h"
  84 #include "string-util.h"
  85 #include "strv.h"
  86 #include "syslog-util.h"
  87 #include "terminal-util.h"
  88 #include "umask-util.h"
  89 #include "unit.h"
  90 #include "user-util.h"
  91 #include "utmp-wtmp.h"
  92
  93 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  94 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  95
  96 #define SNDBUF_SIZE (8*1024*1024)
  97
  98 static int shift_fds(int fds[], size_t n_fds) {
  99         int start, restart_from;
 100
 101         if (n_fds <= 0)
 102                 return 0;
 103
 104         /* Modifies the fds array! (sorts it) */
 105
 106         assert(fds);
 107
 108         start = 0;
 109         for (;;) {
 110                 int i;
 111
 112                 restart_from = -1;
 113
 114                 for (i = start; i < (int) n_fds; i++) {
 115                         int nfd;
 116
 117                         /* Already at right index? */
 118                         if (fds[i] == i+3)
 119                                 continue;
 120
 121                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 122                         if (nfd < 0)
 123                                 return -errno;
 124
 125                         safe_close(fds[i]);
 126                         fds[i] = nfd;
 127
 128                         /* Hmm, the fd we wanted isn't free? Then
 129                          * let's remember that and try again from here */
 130                         if (nfd != i+3 && restart_from < 0)
 131                                 restart_from = i;
 132                 }
 133
 134                 if (restart_from < 0)
 135                         break;
 136
 137                 start = restart_from;
 138         }
 139
 140         return 0;
 141 }
 142
 143 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 144         size_t i, n_fds;
 145         int r;
 146
 147         n_fds = n_socket_fds + n_storage_fds;
 148         if (n_fds <= 0)
 149                 return 0;
 150
 151         assert(fds);
 152
 153         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 154          * O_NONBLOCK only applies to socket activation though. */
 155
 156         for (i = 0; i < n_fds; i++) {
 157
 158                 if (i < n_socket_fds) {
 159                         r = fd_nonblock(fds[i], nonblock);
 160                         if (r < 0)
 161                                 return r;
 162                 }
 163
 164                 /* We unconditionally drop FD_CLOEXEC from the fds,
 165                  * since after all we want to pass these fds to our
 166                  * children */
 167
 168                 r = fd_cloexec(fds[i], false);
 169                 if (r < 0)
 170                         return r;
 171         }
 172
 173         return 0;
 174 }
 175
 176 static const char *exec_context_tty_path(const ExecContext *context) {
 177         assert(context);
 178
 179         if (context->stdio_as_fds)
 180                 return NULL;
 181
 182         if (context->tty_path)
 183                 return context->tty_path;
 184
 185         return "/dev/console";
 186 }
 187
 188 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 189         const char *path;
 190
 191         assert(context);
 192
 193         path = exec_context_tty_path(context);
 194
 195         if (context->tty_vhangup) {
 196                 if (p && p->stdin_fd >= 0)
 197                         (void) terminal_vhangup_fd(p->stdin_fd);
 198                 else if (path)
 199                         (void) terminal_vhangup(path);
 200         }
 201
 202         if (context->tty_reset) {
 203                 if (p && p->stdin_fd >= 0)
 204                         (void) reset_terminal_fd(p->stdin_fd, true);
 205                 else if (path)
 206                         (void) reset_terminal(path);
 207         }
 208
 209         if (context->tty_vt_disallocate && path)
 210                 (void) vt_disallocate(path);
 211 }
 212
 213 static bool is_terminal_input(ExecInput i) {
 214         return IN_SET(i,
 215                       EXEC_INPUT_TTY,
 216                       EXEC_INPUT_TTY_FORCE,
 217                       EXEC_INPUT_TTY_FAIL);
 218 }
 219
 220 static bool is_terminal_output(ExecOutput o) {
 221         return IN_SET(o,
 222                       EXEC_OUTPUT_TTY,
 223                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 224                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 225 }
 226
 227 static bool is_kmsg_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_KMSG,
 230                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 231 }
 232
 233 static bool exec_context_needs_term(const ExecContext *c) {
 234         assert(c);
 235
 236         /* Return true if the execution context suggests we should set $TERM to something useful. */
 237
 238         if (is_terminal_input(c->std_input))
 239                 return true;
 240
 241         if (is_terminal_output(c->std_output))
 242                 return true;
 243
 244         if (is_terminal_output(c->std_error))
 245                 return true;
 246
 247         return !!c->tty_path;
 248 }
 249
 250 static int open_null_as(int flags, int nfd) {
 251         int fd;
 252
 253         assert(nfd >= 0);
 254
 255         fd = open("/dev/null", flags|O_NOCTTY);
 256         if (fd < 0)
 257                 return -errno;
 258
 259         return move_fd(fd, nfd, false);
 260 }
 261
 262 static int connect_journal_socket(
 263                 int fd,
 264                 const char *log_namespace,
 265                 uid_t uid,
 266                 gid_t gid) {
 267
 268         union sockaddr_union sa;
 269         socklen_t sa_len;
 270         uid_t olduid = UID_INVALID;
 271         gid_t oldgid = GID_INVALID;
 272         const char *j;
 273         int r;
 274
 275         j = log_namespace ?
 276                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 277                 "/run/systemd/journal/stdout";
 278         r = sockaddr_un_set_path(&sa.un, j);
 279         if (r < 0)
 280                 return r;
 281         sa_len = r;
 282
 283         if (gid_is_valid(gid)) {
 284                 oldgid = getgid();
 285
 286                 if (setegid(gid) < 0)
 287                         return -errno;
 288         }
 289
 290         if (uid_is_valid(uid)) {
 291                 olduid = getuid();
 292
 293                 if (seteuid(uid) < 0) {
 294                         r = -errno;
 295                         goto restore_gid;
 296                 }
 297         }
 298
 299         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 300
 301         /* If we fail to restore the uid or gid, things will likely
 302            fail later on. This should only happen if an LSM interferes. */
 303
 304         if (uid_is_valid(uid))
 305                 (void) seteuid(olduid);
 306
 307  restore_gid:
 308         if (gid_is_valid(gid))
 309                 (void) setegid(oldgid);
 310
 311         return r;
 312 }
 313
 314 static int connect_logger_as(
 315                 const Unit *unit,
 316                 const ExecContext *context,
 317                 const ExecParameters *params,
 318                 ExecOutput output,
 319                 const char *ident,
 320                 int nfd,
 321                 uid_t uid,
 322                 gid_t gid) {
 323
 324         _cleanup_close_ int fd = -1;
 325         int r;
 326
 327         assert(context);
 328         assert(params);
 329         assert(output < _EXEC_OUTPUT_MAX);
 330         assert(ident);
 331         assert(nfd >= 0);
 332
 333         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 334         if (fd < 0)
 335                 return -errno;
 336
 337         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 338         if (r < 0)
 339                 return r;
 340
 341         if (shutdown(fd, SHUT_RD) < 0)
 342                 return -errno;
 343
 344         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 345
 346         if (dprintf(fd,
 347                 "%s\n"
 348                 "%s\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n",
 354                 context->syslog_identifier ?: ident,
 355                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 356                 context->syslog_priority,
 357                 !!context->syslog_level_prefix,
 358                 false,
 359                 is_kmsg_output(output),
 360                 is_terminal_output(output)) < 0)
 361                 return -errno;
 362
 363         return move_fd(TAKE_FD(fd), nfd, false);
 364 }
 365
 366 static int open_terminal_as(const char *path, int flags, int nfd) {
 367         int fd;
 368
 369         assert(path);
 370         assert(nfd >= 0);
 371
 372         fd = open_terminal(path, flags | O_NOCTTY);
 373         if (fd < 0)
 374                 return fd;
 375
 376         return move_fd(fd, nfd, false);
 377 }
 378
 379 static int acquire_path(const char *path, int flags, mode_t mode) {
 380         union sockaddr_union sa;
 381         socklen_t sa_len;
 382         _cleanup_close_ int fd = -1;
 383         int r;
 384
 385         assert(path);
 386
 387         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 388                 flags |= O_CREAT;
 389
 390         fd = open(path, flags|O_NOCTTY, mode);
 391         if (fd >= 0)
 392                 return TAKE_FD(fd);
 393
 394         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 395                 return -errno;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         r = sockaddr_un_set_path(&sa.un, path);
 400         if (r < 0)
 401                 return r == -EINVAL ? -ENXIO : r;
 402         sa_len = r;
 403
 404         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 405         if (fd < 0)
 406                 return -errno;
 407
 408         if (connect(fd, &sa.sa, sa_len) < 0)
 409                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 410                                                            * indication that his wasn't an AF_UNIX socket after all */
 411
 412         if ((flags & O_ACCMODE) == O_RDONLY)
 413                 r = shutdown(fd, SHUT_WR);
 414         else if ((flags & O_ACCMODE) == O_WRONLY)
 415                 r = shutdown(fd, SHUT_RD);
 416         else
 417                 r = 0;
 418         if (r < 0)
 419                 return -errno;
 420
 421         return TAKE_FD(fd);
 422 }
 423
 424 static int fixup_input(
 425                 const ExecContext *context,
 426                 int socket_fd,
 427                 bool apply_tty_stdin) {
 428
 429         ExecInput std_input;
 430
 431         assert(context);
 432
 433         std_input = context->std_input;
 434
 435         if (is_terminal_input(std_input) && !apply_tty_stdin)
 436                 return EXEC_INPUT_NULL;
 437
 438         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         return std_input;
 445 }
 446
 447 static int fixup_output(ExecOutput std_output, int socket_fd) {
 448
 449         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 450                 return EXEC_OUTPUT_INHERIT;
 451
 452         return std_output;
 453 }
 454
 455 static int setup_input(
 456                 const ExecContext *context,
 457                 const ExecParameters *params,
 458                 int socket_fd,
 459                 const int named_iofds[static 3]) {
 460
 461         ExecInput i;
 462
 463         assert(context);
 464         assert(params);
 465         assert(named_iofds);
 466
 467         if (params->stdin_fd >= 0) {
 468                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 469                         return -errno;
 470
 471                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 472                 if (isatty(STDIN_FILENO)) {
 473                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 474                         (void) reset_terminal_fd(STDIN_FILENO, true);
 475                 }
 476
 477                 return STDIN_FILENO;
 478         }
 479
 480         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 481
 482         switch (i) {
 483
 484         case EXEC_INPUT_NULL:
 485                 return open_null_as(O_RDONLY, STDIN_FILENO);
 486
 487         case EXEC_INPUT_TTY:
 488         case EXEC_INPUT_TTY_FORCE:
 489         case EXEC_INPUT_TTY_FAIL: {
 490                 int fd;
 491
 492                 fd = acquire_terminal(exec_context_tty_path(context),
 493                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 494                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 495                                                                   ACQUIRE_TERMINAL_WAIT,
 496                                       USEC_INFINITY);
 497                 if (fd < 0)
 498                         return fd;
 499
 500                 return move_fd(fd, STDIN_FILENO, false);
 501         }
 502
 503         case EXEC_INPUT_SOCKET:
 504                 assert(socket_fd >= 0);
 505
 506                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 507
 508         case EXEC_INPUT_NAMED_FD:
 509                 assert(named_iofds[STDIN_FILENO] >= 0);
 510
 511                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 512                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_DATA: {
 515                 int fd;
 516
 517                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 518                 if (fd < 0)
 519                         return fd;
 520
 521                 return move_fd(fd, STDIN_FILENO, false);
 522         }
 523
 524         case EXEC_INPUT_FILE: {
 525                 bool rw;
 526                 int fd;
 527
 528                 assert(context->stdio_file[STDIN_FILENO]);
 529
 530                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 531                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 532
 533                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 534                 if (fd < 0)
 535                         return fd;
 536
 537                 return move_fd(fd, STDIN_FILENO, false);
 538         }
 539
 540         default:
 541                 assert_not_reached("Unknown input type");
 542         }
 543 }
 544
 545 static bool can_inherit_stderr_from_stdout(
 546                 const ExecContext *context,
 547                 ExecOutput o,
 548                 ExecOutput e) {
 549
 550         assert(context);
 551
 552         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 553          * stderr fd */
 554
 555         if (e == EXEC_OUTPUT_INHERIT)
 556                 return true;
 557         if (e != o)
 558                 return false;
 559
 560         if (e == EXEC_OUTPUT_NAMED_FD)
 561                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 562
 563         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 564                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 565
 566         return true;
 567 }
 568
 569 static int setup_output(
 570                 const Unit *unit,
 571                 const ExecContext *context,
 572                 const ExecParameters *params,
 573                 int fileno,
 574                 int socket_fd,
 575                 const int named_iofds[static 3],
 576                 const char *ident,
 577                 uid_t uid,
 578                 gid_t gid,
 579                 dev_t *journal_stream_dev,
 580                 ino_t *journal_stream_ino) {
 581
 582         ExecOutput o;
 583         ExecInput i;
 584         int r;
 585
 586         assert(unit);
 587         assert(context);
 588         assert(params);
 589         assert(ident);
 590         assert(journal_stream_dev);
 591         assert(journal_stream_ino);
 592
 593         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 594
 595                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 596                         return -errno;
 597
 598                 return STDOUT_FILENO;
 599         }
 600
 601         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 602                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 603                         return -errno;
 604
 605                 return STDERR_FILENO;
 606         }
 607
 608         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 609         o = fixup_output(context->std_output, socket_fd);
 610
 611         if (fileno == STDERR_FILENO) {
 612                 ExecOutput e;
 613                 e = fixup_output(context->std_error, socket_fd);
 614
 615                 /* This expects the input and output are already set up */
 616
 617                 /* Don't change the stderr file descriptor if we inherit all
 618                  * the way and are not on a tty */
 619                 if (e == EXEC_OUTPUT_INHERIT &&
 620                     o == EXEC_OUTPUT_INHERIT &&
 621                     i == EXEC_INPUT_NULL &&
 622                     !is_terminal_input(context->std_input) &&
 623                     getppid () != 1)
 624                         return fileno;
 625
 626                 /* Duplicate from stdout if possible */
 627                 if (can_inherit_stderr_from_stdout(context, o, e))
 628                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 629
 630                 o = e;
 631
 632         } else if (o == EXEC_OUTPUT_INHERIT) {
 633                 /* If input got downgraded, inherit the original value */
 634                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 635                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 636
 637                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 638                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 639                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 640
 641                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 642                 if (getppid() != 1)
 643                         return fileno;
 644
 645                 /* We need to open /dev/null here anew, to get the right access mode. */
 646                 return open_null_as(O_WRONLY, fileno);
 647         }
 648
 649         switch (o) {
 650
 651         case EXEC_OUTPUT_NULL:
 652                 return open_null_as(O_WRONLY, fileno);
 653
 654         case EXEC_OUTPUT_TTY:
 655                 if (is_terminal_input(i))
 656                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 657
 658                 /* We don't reset the terminal if this is just about output */
 659                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 660
 661         case EXEC_OUTPUT_KMSG:
 662         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 663         case EXEC_OUTPUT_JOURNAL:
 664         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 665                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 666                 if (r < 0) {
 667                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 668                         r = open_null_as(O_WRONLY, fileno);
 669                 } else {
 670                         struct stat st;
 671
 672                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 673                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 674                          * services to detect whether they are connected to the journal or not.
 675                          *
 676                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 677                          * about STDERR as that's usually the best way to do logging. */
 678
 679                         if (fstat(fileno, &st) >= 0 &&
 680                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 681                                 *journal_stream_dev = st.st_dev;
 682                                 *journal_stream_ino = st.st_ino;
 683                         }
 684                 }
 685                 return r;
 686
 687         case EXEC_OUTPUT_SOCKET:
 688                 assert(socket_fd >= 0);
 689
 690                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 691
 692         case EXEC_OUTPUT_NAMED_FD:
 693                 assert(named_iofds[fileno] >= 0);
 694
 695                 (void) fd_nonblock(named_iofds[fileno], false);
 696                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 697
 698         case EXEC_OUTPUT_FILE:
 699         case EXEC_OUTPUT_FILE_APPEND: {
 700                 bool rw;
 701                 int fd, flags;
 702
 703                 assert(context->stdio_file[fileno]);
 704
 705                 rw = context->std_input == EXEC_INPUT_FILE &&
 706                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 707
 708                 if (rw)
 709                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 710
 711                 flags = O_WRONLY;
 712                 if (o == EXEC_OUTPUT_FILE_APPEND)
 713                         flags |= O_APPEND;
 714
 715                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 716                 if (fd < 0)
 717                         return fd;
 718
 719                 return move_fd(fd, fileno, 0);
 720         }
 721
 722         default:
 723                 assert_not_reached("Unknown error type");
 724         }
 725 }
 726
 727 static int chown_terminal(int fd, uid_t uid) {
 728         int r;
 729
 730         assert(fd >= 0);
 731
 732         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 733         if (isatty(fd) < 1) {
 734                 if (IN_SET(errno, EINVAL, ENOTTY))
 735                         return 0; /* not a tty */
 736
 737                 return -errno;
 738         }
 739
 740         /* This might fail. What matters are the results. */
 741         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 742         if (r < 0)
 743                 return r;
 744
 745         return 1;
 746 }
 747
 748 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 749         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 750         int r;
 751
 752         assert(_saved_stdin);
 753         assert(_saved_stdout);
 754
 755         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 756         if (saved_stdin < 0)
 757                 return -errno;
 758
 759         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 760         if (saved_stdout < 0)
 761                 return -errno;
 762
 763         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 764         if (fd < 0)
 765                 return fd;
 766
 767         r = chown_terminal(fd, getuid());
 768         if (r < 0)
 769                 return r;
 770
 771         r = reset_terminal_fd(fd, true);
 772         if (r < 0)
 773                 return r;
 774
 775         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 776         fd = -1;
 777         if (r < 0)
 778                 return r;
 779
 780         *_saved_stdin = saved_stdin;
 781         *_saved_stdout = saved_stdout;
 782
 783         saved_stdin = saved_stdout = -1;
 784
 785         return 0;
 786 }
 787
 788 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 789         assert(err < 0);
 790
 791         if (err == -ETIMEDOUT)
 792                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 793         else {
 794                 errno = -err;
 795                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 796         }
 797 }
 798
 799 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 800         _cleanup_close_ int fd = -1;
 801
 802         assert(vc);
 803
 804         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 805         if (fd < 0)
 806                 return;
 807
 808         write_confirm_error_fd(err, fd, u);
 809 }
 810
 811 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 812         int r = 0;
 813
 814         assert(saved_stdin);
 815         assert(saved_stdout);
 816
 817         release_terminal();
 818
 819         if (*saved_stdin >= 0)
 820                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 821                         r = -errno;
 822
 823         if (*saved_stdout >= 0)
 824                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 825                         r = -errno;
 826
 827         *saved_stdin = safe_close(*saved_stdin);
 828         *saved_stdout = safe_close(*saved_stdout);
 829
 830         return r;
 831 }
 832
 833 enum {
 834         CONFIRM_PRETEND_FAILURE = -1,
 835         CONFIRM_PRETEND_SUCCESS =  0,
 836         CONFIRM_EXECUTE = 1,
 837 };
 838
 839 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 840         int saved_stdout = -1, saved_stdin = -1, r;
 841         _cleanup_free_ char *e = NULL;
 842         char c;
 843
 844         /* For any internal errors, assume a positive response. */
 845         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 846         if (r < 0) {
 847                 write_confirm_error(r, vc, u);
 848                 return CONFIRM_EXECUTE;
 849         }
 850
 851         /* confirm_spawn might have been disabled while we were sleeping. */
 852         if (manager_is_confirm_spawn_disabled(u->manager)) {
 853                 r = 1;
 854                 goto restore_stdio;
 855         }
 856
 857         e = ellipsize(cmdline, 60, 100);
 858         if (!e) {
 859                 log_oom();
 860                 r = CONFIRM_EXECUTE;
 861                 goto restore_stdio;
 862         }
 863
 864         for (;;) {
 865                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 866                 if (r < 0) {
 867                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 868                         r = CONFIRM_EXECUTE;
 869                         goto restore_stdio;
 870                 }
 871
 872                 switch (c) {
 873                 case 'c':
 874                         printf("Resuming normal execution.\n");
 875                         manager_disable_confirm_spawn();
 876                         r = 1;
 877                         break;
 878                 case 'D':
 879                         unit_dump(u, stdout, "  ");
 880                         continue; /* ask again */
 881                 case 'f':
 882                         printf("Failing execution.\n");
 883                         r = CONFIRM_PRETEND_FAILURE;
 884                         break;
 885                 case 'h':
 886                         printf("  c - continue, proceed without asking anymore\n"
 887                                "  D - dump, show the state of the unit\n"
 888                                "  f - fail, don't execute the command and pretend it failed\n"
 889                                "  h - help\n"
 890                                "  i - info, show a short summary of the unit\n"
 891                                "  j - jobs, show jobs that are in progress\n"
 892                                "  s - skip, don't execute the command and pretend it succeeded\n"
 893                                "  y - yes, execute the command\n");
 894                         continue; /* ask again */
 895                 case 'i':
 896                         printf("  Description: %s\n"
 897                                "  Unit:        %s\n"
 898                                "  Command:     %s\n",
 899                                u->id, u->description, cmdline);
 900                         continue; /* ask again */
 901                 case 'j':
 902                         manager_dump_jobs(u->manager, stdout, "  ");
 903                         continue; /* ask again */
 904                 case 'n':
 905                         /* 'n' was removed in favor of 'f'. */
 906                         printf("Didn't understand 'n', did you mean 'f'?\n");
 907                         continue; /* ask again */
 908                 case 's':
 909                         printf("Skipping execution.\n");
 910                         r = CONFIRM_PRETEND_SUCCESS;
 911                         break;
 912                 case 'y':
 913                         r = CONFIRM_EXECUTE;
 914                         break;
 915                 default:
 916                         assert_not_reached("Unhandled choice");
 917                 }
 918                 break;
 919         }
 920
 921 restore_stdio:
 922         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 923         return r;
 924 }
 925
 926 static int get_fixed_user(const ExecContext *c, const char **user,
 927                           uid_t *uid, gid_t *gid,
 928                           const char **home, const char **shell) {
 929         int r;
 930         const char *name;
 931
 932         assert(c);
 933
 934         if (!c->user)
 935                 return 0;
 936
 937         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 938          * (i.e. are "/" or "/bin/nologin"). */
 939
 940         name = c->user;
 941         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 942         if (r < 0)
 943                 return r;
 944
 945         *user = name;
 946         return 0;
 947 }
 948
 949 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->group)
 956                 return 0;
 957
 958         name = c->group;
 959         r = get_group_creds(&name, gid, 0);
 960         if (r < 0)
 961                 return r;
 962
 963         *group = name;
 964         return 0;
 965 }
 966
 967 static int get_supplementary_groups(const ExecContext *c, const char *user,
 968                                     const char *group, gid_t gid,
 969                                     gid_t **supplementary_gids, int *ngids) {
 970         char **i;
 971         int r, k = 0;
 972         int ngroups_max;
 973         bool keep_groups = false;
 974         gid_t *groups = NULL;
 975         _cleanup_free_ gid_t *l_gids = NULL;
 976
 977         assert(c);
 978
 979         /*
 980          * If user is given, then lookup GID and supplementary groups list.
 981          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 982          * here and as early as possible so we keep the list of supplementary
 983          * groups of the caller.
 984          */
 985         if (user && gid_is_valid(gid) && gid != 0) {
 986                 /* First step, initialize groups from /etc/groups */
 987                 if (initgroups(user, gid) < 0)
 988                         return -errno;
 989
 990                 keep_groups = true;
 991         }
 992
 993         if (strv_isempty(c->supplementary_groups))
 994                 return 0;
 995
 996         /*
 997          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 998          * be positive, otherwise fail.
 999          */
1000         errno = 0;
1001         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1002         if (ngroups_max <= 0)
1003                 return errno_or_else(EOPNOTSUPP);
1004
1005         l_gids = new(gid_t, ngroups_max);
1006         if (!l_gids)
1007                 return -ENOMEM;
1008
1009         if (keep_groups) {
1010                 /*
1011                  * Lookup the list of groups that the user belongs to, we
1012                  * avoid NSS lookups here too for gid=0.
1013                  */
1014                 k = ngroups_max;
1015                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1016                         return -EINVAL;
1017         } else
1018                 k = 0;
1019
1020         STRV_FOREACH(i, c->supplementary_groups) {
1021                 const char *g;
1022
1023                 if (k >= ngroups_max)
1024                         return -E2BIG;
1025
1026                 g = *i;
1027                 r = get_group_creds(&g, l_gids+k, 0);
1028                 if (r < 0)
1029                         return r;
1030
1031                 k++;
1032         }
1033
1034         /*
1035          * Sets ngids to zero to drop all supplementary groups, happens
1036          * when we are under root and SupplementaryGroups= is empty.
1037          */
1038         if (k == 0) {
1039                 *ngids = 0;
1040                 return 0;
1041         }
1042
1043         /* Otherwise get the final list of supplementary groups */
1044         groups = memdup(l_gids, sizeof(gid_t) * k);
1045         if (!groups)
1046                 return -ENOMEM;
1047
1048         *supplementary_gids = groups;
1049         *ngids = k;
1050
1051         groups = NULL;
1052
1053         return 0;
1054 }
1055
/* Applies the group credentials to the calling process: first the supplementary group list (only if
 * ngids is positive), then the real, effective and saved GID (only if gid is valid). Returns 0 on
 * success or a negative errno-style value from the first step that failed. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1074
1075 static int enforce_user(const ExecContext *context, uid_t uid) {
1076         assert(context);
1077
1078         if (!uid_is_valid(uid))
1079                 return 0;
1080
1081         /* Sets (but doesn't look up) the uid and make sure we keep the
1082          * capabilities while doing so. */
1083
1084         if (context->capability_ambient_set != 0) {
1085
1086                 /* First step: If we need to keep capabilities but
1087                  * drop privileges we need to make sure we keep our
1088                  * caps, while we drop privileges. */
1089                 if (uid != 0) {
1090                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1091
1092                         if (prctl(PR_GET_SECUREBITS) != sb)
1093                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1094                                         return -errno;
1095                 }
1096         }
1097
1098         /* Second step: actually set the uids */
1099         if (setresuid(uid, uid, uid) < 0)
1100                 return -errno;
1101
1102         /* At this point we should have all necessary capabilities but
1103            are otherwise a normal user. However, the caps might got
1104            corrupted due to the setresuid() so we need clean them up
1105            later. This is done outside of this call. */
1106
1107         return 0;
1108 }
1109
1110 #if HAVE_PAM
1111
/* PAM conversation callback that refuses every conversation request: all four arguments are ignored
 * and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}
1122
1123 #endif
1124
/* Opens a PAM session for the process we are about to execute and forks off a helper ("(sd-pam)") that
 * tears the session down again once the calling process has died.
 *
 *   name       PAM service name, passed to pam_start()
 *   user       user name, passed to pam_start()
 *   uid, gid   credentials the helper child switches to (best effort) before it starts waiting
 *   tty        TTY to report as PAM_TTY; if NULL, the TTY name of stdin is used when one can be determined
 *   env        in/out: every entry is exported into the PAM environment; on success the list is replaced
 *              by the environment list PAM hands back
 *   fds/n_fds  file descriptors that are closed in the helper child
 *
 * Returns the result of strv_free_and_replace() on success. On failure a negative errno-style value is
 * returned; PAM errors are reported as -EPERM since they do not map to errno. When built without PAM
 * support this is a no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* A failure to establish credentials is not fatal, we only log it */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure through its return value (a
                                 * positive error number), it neither returns a negative value nor
                                 * sets errno. On failure "sig" is not filled in, hence don't look
                                 * at it then. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1348
/* Renames the current process to the file name part of "path", put in parentheses. Names longer than
 * eight characters are cut down to their last eight; if the file name part is empty "(...)" is used. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *base;
        char *out = process_name;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        /* Assemble "(" + name + ")" plus the terminating NUL byte */
        *out++ = '(';
        memcpy(out, base, n);
        out += n;
        *out++ = ')';
        *out = 0;

        rename_process(process_name);
}
1379
1380 static bool context_has_address_families(const ExecContext *c) {
1381         assert(c);
1382
1383         return c->address_families_allow_list ||
1384                 !set_isempty(c->address_families);
1385 }
1386
1387 static bool context_has_syscall_filters(const ExecContext *c) {
1388         assert(c);
1389
1390         return c->syscall_allow_list ||
1391                 !hashmap_isempty(c->syscall_filter);
1392 }
1393
1394 static bool context_has_no_new_privileges(const ExecContext *c) {
1395         assert(c);
1396
1397         if (c->no_new_privileges)
1398                 return true;
1399
1400         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1401                 return false;
1402
1403         /* We need NNP if we have any form of seccomp and are unprivileged */
1404         return context_has_address_families(c) ||
1405                 c->memory_deny_write_execute ||
1406                 c->restrict_realtime ||
1407                 c->restrict_suid_sgid ||
1408                 exec_context_restrict_namespaces_set(c) ||
1409                 c->protect_clock ||
1410                 c->protect_kernel_tunables ||
1411                 c->protect_kernel_modules ||
1412                 c->protect_kernel_logs ||
1413                 c->private_devices ||
1414                 context_has_syscall_filters(c) ||
1415                 !set_isempty(c->syscall_archs) ||
1416                 c->lock_personality ||
1417                 c->protect_hostname;
1418 }
1419
1420 #if HAVE_SECCOMP
1421
1422 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1423
1424         if (is_seccomp_available())
1425                 return false;
1426
1427         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428         return true;
1429 }
1430
1431 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1432         uint32_t negative_action, default_action, action;
1433         int r;
1434
1435         assert(u);
1436         assert(c);
1437
1438         if (!context_has_syscall_filters(c))
1439                 return 0;
1440
1441         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1442                 return 0;
1443
1444         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446         if (c->syscall_allow_list) {
1447                 default_action = negative_action;
1448                 action = SCMP_ACT_ALLOW;
1449         } else {
1450                 default_action = SCMP_ACT_ALLOW;
1451                 action = negative_action;
1452         }
1453
1454         if (needs_ambient_hack) {
1455                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456                 if (r < 0)
1457                         return r;
1458         }
1459
1460         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1464         assert(u);
1465         assert(c);
1466
1467         if (set_isempty(c->syscall_archs))
1468                 return 0;
1469
1470         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1471                 return 0;
1472
1473         return seccomp_restrict_archs(c->syscall_archs);
1474 }
1475
1476 static int apply_address_families(const Unit* u, const ExecContext *c) {
1477         assert(u);
1478         assert(c);
1479
1480         if (!context_has_address_families(c))
1481                 return 0;
1482
1483         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1484                 return 0;
1485
1486         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1487 }
1488
1489 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1490         assert(u);
1491         assert(c);
1492
1493         if (!c->memory_deny_write_execute)
1494                 return 0;
1495
1496         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1497                 return 0;
1498
1499         return seccomp_memory_deny_write_execute();
1500 }
1501
1502 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1503         assert(u);
1504         assert(c);
1505
1506         if (!c->restrict_realtime)
1507                 return 0;
1508
1509         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1510                 return 0;
1511
1512         return seccomp_restrict_realtime();
1513 }
1514
1515 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1516         assert(u);
1517         assert(c);
1518
1519         if (!c->restrict_suid_sgid)
1520                 return 0;
1521
1522         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1523                 return 0;
1524
1525         return seccomp_restrict_suid_sgid();
1526 }
1527
1528 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1529         assert(u);
1530         assert(c);
1531
1532         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1533          * let's protect even those systems where this is left on in the kernel. */
1534
1535         if (!c->protect_kernel_tunables)
1536                 return 0;
1537
1538         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1539                 return 0;
1540
1541         return seccomp_protect_sysctl();
1542 }
1543
1544 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1545         assert(u);
1546         assert(c);
1547
1548         /* Turn off module syscalls on ProtectKernelModules=yes */
1549
1550         if (!c->protect_kernel_modules)
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1554                 return 0;
1555
1556         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1557 }
1558
1559 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1560         assert(u);
1561         assert(c);
1562
1563         if (!c->protect_kernel_logs)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1567                 return 0;
1568
1569         return seccomp_protect_syslog();
1570 }
1571
1572 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1573         assert(u);
1574         assert(c);
1575
1576         if (!c->protect_clock)
1577                 return 0;
1578
1579         if (skip_seccomp_unavailable(u, "ProtectClock="))
1580                 return 0;
1581
1582         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1583 }
1584
1585 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1586         assert(u);
1587         assert(c);
1588
1589         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1590
1591         if (!c->private_devices)
1592                 return 0;
1593
1594         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1595                 return 0;
1596
1597         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1598 }
1599
1600 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1601         assert(u);
1602         assert(c);
1603
1604         if (!exec_context_restrict_namespaces_set(c))
1605                 return 0;
1606
1607         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1608                 return 0;
1609
1610         return seccomp_restrict_namespaces(c->restrict_namespaces);
1611 }
1612
1613 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1614         unsigned long personality;
1615         int r;
1616
1617         assert(u);
1618         assert(c);
1619
1620         if (!c->lock_personality)
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "LockPersonality="))
1624                 return 0;
1625
1626         personality = c->personality;
1627
1628         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1629         if (personality == PERSONALITY_INVALID) {
1630
1631                 r = opinionated_personality(&personality);
1632                 if (r < 0)
1633                         return r;
1634         }
1635
1636         return seccomp_lock_personality(personality);
1637 }
1638
1639 #endif
1640
1641 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1642         assert(u);
1643         assert(c);
1644
1645         if (!c->protect_hostname)
1646                 return 0;
1647
1648         if (ns_type_supported(NAMESPACE_UTS)) {
1649                 if (unshare(CLONE_NEWUTS) < 0) {
1650                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1651                                 *ret_exit_status = EXIT_NAMESPACE;
1652                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1653                         }
1654
1655                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1656                 }
1657         } else
1658                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1659
1660 #if HAVE_SECCOMP
1661         int r;
1662
1663         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1664                 return 0;
1665
1666         r = seccomp_protect_hostname();
1667         if (r < 0) {
1668                 *ret_exit_status = EXIT_SECCOMP;
1669                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1670         }
1671 #endif
1672
1673         return 0;
1674 }
1675
1676 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1677         assert(idle_pipe);
1678
1679         idle_pipe[1] = safe_close(idle_pipe[1]);
1680         idle_pipe[2] = safe_close(idle_pipe[2]);
1681
1682         if (idle_pipe[0] >= 0) {
1683                 int r;
1684
1685                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1686
1687                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1688                         ssize_t n;
1689
1690                         /* Signal systemd that we are bored and want to continue. */
1691                         n = write(idle_pipe[3], "x", 1);
1692                         if (n > 0)
1693                                 /* Wait for systemd to react to the signal above. */
1694                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1695                 }
1696
1697                 idle_pipe[0] = safe_close(idle_pipe[0]);
1698
1699         }
1700
1701         idle_pipe[3] = safe_close(idle_pipe[3]);
1702 }
1703
1704 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1705
1706 static int build_environment(
1707                 const Unit *u,
1708                 const ExecContext *c,
1709                 const ExecParameters *p,
1710                 size_t n_fds,
1711                 const char *home,
1712                 const char *username,
1713                 const char *shell,
1714                 dev_t journal_stream_dev,
1715                 ino_t journal_stream_ino,
1716                 char ***ret) {
1717
1718         _cleanup_strv_free_ char **our_env = NULL;
1719         ExecDirectoryType t;
1720         size_t n_env = 0;
1721         char *x;
1722
1723         assert(u);
1724         assert(c);
1725         assert(p);
1726         assert(ret);
1727
1728         our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1729         if (!our_env)
1730                 return -ENOMEM;
1731
1732         if (n_fds > 0) {
1733                 _cleanup_free_ char *joined = NULL;
1734
1735                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1736                         return -ENOMEM;
1737                 our_env[n_env++] = x;
1738
1739                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1740                         return -ENOMEM;
1741                 our_env[n_env++] = x;
1742
1743                 joined = strv_join(p->fd_names, ":");
1744                 if (!joined)
1745                         return -ENOMEM;
1746
1747                 x = strjoin("LISTEN_FDNAMES=", joined);
1748                 if (!x)
1749                         return -ENOMEM;
1750                 our_env[n_env++] = x;
1751         }
1752
1753         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1754                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1755                         return -ENOMEM;
1756                 our_env[n_env++] = x;
1757
1758                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1759                         return -ENOMEM;
1760                 our_env[n_env++] = x;
1761         }
1762
1763         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1764          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1765          * check the database directly. */
1766         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1767                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1768                 if (!x)
1769                         return -ENOMEM;
1770                 our_env[n_env++] = x;
1771         }
1772
1773         if (home) {
1774                 x = strjoin("HOME=", home);
1775                 if (!x)
1776                         return -ENOMEM;
1777
1778                 path_simplify(x + 5, true);
1779                 our_env[n_env++] = x;
1780         }
1781
1782         if (username) {
1783                 x = strjoin("LOGNAME=", username);
1784                 if (!x)
1785                         return -ENOMEM;
1786                 our_env[n_env++] = x;
1787
1788                 x = strjoin("USER=", username);
1789                 if (!x)
1790                         return -ENOMEM;
1791                 our_env[n_env++] = x;
1792         }
1793
1794         if (shell) {
1795                 x = strjoin("SHELL=", shell);
1796                 if (!x)
1797                         return -ENOMEM;
1798
1799                 path_simplify(x + 6, true);
1800                 our_env[n_env++] = x;
1801         }
1802
1803         if (!sd_id128_is_null(u->invocation_id)) {
1804                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1805                         return -ENOMEM;
1806
1807                 our_env[n_env++] = x;
1808         }
1809
1810         if (exec_context_needs_term(c)) {
1811                 const char *tty_path, *term = NULL;
1812
1813                 tty_path = exec_context_tty_path(c);
1814
1815                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1816                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1817                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1818
1819                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1820                         term = getenv("TERM");
1821
1822                 if (!term)
1823                         term = default_term_for_tty(tty_path);
1824
1825                 x = strjoin("TERM=", term);
1826                 if (!x)
1827                         return -ENOMEM;
1828                 our_env[n_env++] = x;
1829         }
1830
1831         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1832                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1833                         return -ENOMEM;
1834
1835                 our_env[n_env++] = x;
1836         }
1837
1838         if (c->log_namespace) {
1839                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1840                 if (!x)
1841                         return -ENOMEM;
1842
1843                 our_env[n_env++] = x;
1844         }
1845
1846         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1847                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1848                 const char *n;
1849
1850                 if (!p->prefix[t])
1851                         continue;
1852
1853                 if (strv_isempty(c->directories[t].paths))
1854                         continue;
1855
1856                 n = exec_directory_env_name_to_string(t);
1857                 if (!n)
1858                         continue;
1859
1860                 pre = strjoin(p->prefix[t], "/");
1861                 if (!pre)
1862                         return -ENOMEM;
1863
1864                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1865                 if (!joined)
1866                         return -ENOMEM;
1867
1868                 x = strjoin(n, "=", joined);
1869                 if (!x)
1870                         return -ENOMEM;
1871
1872                 our_env[n_env++] = x;
1873         }
1874
1875         our_env[n_env++] = NULL;
1876         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1877
1878         *ret = TAKE_PTR(our_env);
1879
1880         return 0;
1881 }
1882
1883 static int build_pass_environment(const ExecContext *c, char ***ret) {
1884         _cleanup_strv_free_ char **pass_env = NULL;
1885         size_t n_env = 0, n_bufsize = 0;
1886         char **i;
1887
1888         STRV_FOREACH(i, c->pass_environment) {
1889                 _cleanup_free_ char *x = NULL;
1890                 char *v;
1891
1892                 v = getenv(*i);
1893                 if (!v)
1894                         continue;
1895                 x = strjoin(*i, "=", v);
1896                 if (!x)
1897                         return -ENOMEM;
1898
1899                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1900                         return -ENOMEM;
1901
1902                 pass_env[n_env++] = TAKE_PTR(x);
1903                 pass_env[n_env] = NULL;
1904         }
1905
1906         *ret = TAKE_PTR(pass_env);
1907
1908         return 0;
1909 }
1910
1911 static bool exec_needs_mount_namespace(
1912                 const ExecContext *context,
1913                 const ExecParameters *params,
1914                 const ExecRuntime *runtime) {
1915
1916         assert(context);
1917         assert(params);
1918
1919         if (context->root_image)
1920                 return true;
1921
1922         if (!strv_isempty(context->read_write_paths) ||
1923             !strv_isempty(context->read_only_paths) ||
1924             !strv_isempty(context->inaccessible_paths))
1925                 return true;
1926
1927         if (context->n_bind_mounts > 0)
1928                 return true;
1929
1930         if (context->n_temporary_filesystems > 0)
1931                 return true;
1932
1933         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1934                 return true;
1935
1936         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1937                 return true;
1938
1939         if (context->private_devices ||
1940             context->private_mounts ||
1941             context->protect_system != PROTECT_SYSTEM_NO ||
1942             context->protect_home != PROTECT_HOME_NO ||
1943             context->protect_kernel_tunables ||
1944             context->protect_kernel_modules ||
1945             context->protect_kernel_logs ||
1946             context->protect_control_groups)
1947                 return true;
1948
1949         if (context->root_directory) {
1950                 ExecDirectoryType t;
1951
1952                 if (context->mount_apivfs)
1953                         return true;
1954
1955                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1956                         if (!params->prefix[t])
1957                                 continue;
1958
1959                         if (!strv_isempty(context->directories[t].paths))
1960                                 return true;
1961                 }
1962         }
1963
1964         if (context->dynamic_user &&
1965             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1966              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1967              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1968                 return true;
1969
1970         if (context->log_namespace)
1971                 return true;
1972
1973         return false;
1974 }
1975
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a short-lived
 * helper child write the UID/GID maps for it.
 *
 * ouid/ogid: the IDs that are always mapped to themselves.
 * uid/gid:   additionally mapped to themselves, but only if they are valid, differ from ouid/ogid and
 *            the matching capability (CAP_SETUID/CAP_SETGID) is effective.
 *
 * Returns 0 on success. Returns -ENOMEM if a map string cannot be formatted, the negative errno of a
 * failed system call (including one forwarded from the helper child), or -EIO if the child sent
 * something unexpected or did not exit with EXIT_SUCCESS. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value written to/read from the eventfd below */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only ever writes to the error pipe, hence drop the read end */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Pass the negative errno on to the parent, best effort */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: drop our copy of the write end, so that the read() below returns 0 (EOF) once the child
         * has exited without reporting an error */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        /* NOTE(review): presumably resetting pid keeps the sigkill_waitp cleanup from acting on the child we
         * just waited for — confirm against sigkill_waitp() */
        pid = 0;
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2135
2136 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2137         if (!context->dynamic_user)
2138                 return false;
2139
2140         if (type == EXEC_DIRECTORY_CONFIGURATION)
2141                 return false;
2142
2143         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2144                 return false;
2145
2146         return true;
2147 }
2148
/* Creates every directory listed in context->directories[type].paths below params->prefix[type], migrates
 * it between the plain and the "private/" layout if needed, and then adjusts access mode and ownership
 * of the resulting tree.
 *
 * uid/gid: the owner to apply recursively; if EXEC_CHOWN_DIRECTORIES is set in params->flags, invalid
 *          values are replaced by 0.
 * exit_status: on failure, set to the exit status that corresponds to the directory type.
 *
 * Returns 0 on success (also when no prefix is set for this type, in which case nothing is done), or a
 * negative errno-style value on failure. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Maps each directory type to the exit status reported when setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        if (!params->prefix[type])
                return 0;

        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public path below the prefix; pp is the path below "private/" and is only set
                 * in the private case */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (exec_directory_is_private(context, type)) {
                        _cleanup_free_ char *private_root = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        private_root = path_join(params->prefix[type], "private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = path_join(private_root, *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", *rt);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the private directory into its place */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                /* An already existing directory is fine, everything else is fatal */
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), *rt,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for this entry */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist.*/
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Tell the caller which kind of directory could not be set up */
        *exit_status = exit_status_table[type];
        return r;
}
2373
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID argument 0). If context->smack_process_label is set,
 * that label is used. Otherwise, and only when SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the
 * SMACK_ATTR_EXEC label read from command->path is used, falling back to SMACK_DEFAULT_PROCESS_LABEL if
 * none was read. Returns 0 on success, or the negative error of the failing helper. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                /* An explicitly configured label always wins */
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;
                const char *label;

                /* A missing attribute or missing support is not an error, we just use the default then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                label = exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL;

                r = mac_smack_apply_pid(0, label);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2406
2407 static int compile_bind_mounts(
2408                 const ExecContext *context,
2409                 const ExecParameters *params,
2410                 BindMount **ret_bind_mounts,
2411                 size_t *ret_n_bind_mounts,
2412                 char ***ret_empty_directories) {
2413
2414         _cleanup_strv_free_ char **empty_directories = NULL;
2415         BindMount *bind_mounts;
2416         size_t n, h = 0, i;
2417         ExecDirectoryType t;
2418         int r;
2419
2420         assert(context);
2421         assert(params);
2422         assert(ret_bind_mounts);
2423         assert(ret_n_bind_mounts);
2424         assert(ret_empty_directories);
2425
2426         n = context->n_bind_mounts;
2427         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2428                 if (!params->prefix[t])
2429                         continue;
2430
2431                 n += strv_length(context->directories[t].paths);
2432         }
2433
2434         if (n <= 0) {
2435                 *ret_bind_mounts = NULL;
2436                 *ret_n_bind_mounts = 0;
2437                 *ret_empty_directories = NULL;
2438                 return 0;
2439         }
2440
2441         bind_mounts = new(BindMount, n);
2442         if (!bind_mounts)
2443                 return -ENOMEM;
2444
2445         for (i = 0; i < context->n_bind_mounts; i++) {
2446                 BindMount *item = context->bind_mounts + i;
2447                 char *s, *d;
2448
2449                 s = strdup(item->source);
2450                 if (!s) {
2451                         r = -ENOMEM;
2452                         goto finish;
2453                 }
2454
2455                 d = strdup(item->destination);
2456                 if (!d) {
2457                         free(s);
2458                         r = -ENOMEM;
2459                         goto finish;
2460                 }
2461
2462                 bind_mounts[h++] = (BindMount) {
2463                         .source = s,
2464                         .destination = d,
2465                         .read_only = item->read_only,
2466                         .recursive = item->recursive,
2467                         .ignore_enoent = item->ignore_enoent,
2468                 };
2469         }
2470
2471         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2472                 char **suffix;
2473
2474                 if (!params->prefix[t])
2475                         continue;
2476
2477                 if (strv_isempty(context->directories[t].paths))
2478                         continue;
2479
2480                 if (exec_directory_is_private(context, t) &&
2481                     !(context->root_directory || context->root_image)) {
2482                         char *private_root;
2483
2484                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2485                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2486                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2487
2488                         private_root = path_join(params->prefix[t], "private");
2489                         if (!private_root) {
2490                                 r = -ENOMEM;
2491                                 goto finish;
2492                         }
2493
2494                         r = strv_consume(&empty_directories, private_root);
2495                         if (r < 0)
2496                                 goto finish;
2497                 }
2498
2499                 STRV_FOREACH(suffix, context->directories[t].paths) {
2500                         char *s, *d;
2501
2502                         if (exec_directory_is_private(context, t))
2503                                 s = path_join(params->prefix[t], "private", *suffix);
2504                         else
2505                                 s = path_join(params->prefix[t], *suffix);
2506                         if (!s) {
2507                                 r = -ENOMEM;
2508                                 goto finish;
2509                         }
2510
2511                         if (exec_directory_is_private(context, t) &&
2512                             (context->root_directory || context->root_image))
2513                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2514                                  * directory is not created on the root directory. So, let's bind-mount the directory
2515                                  * on the 'non-private' place. */
2516                                 d = path_join(params->prefix[t], *suffix);
2517                         else
2518                                 d = strdup(s);
2519                         if (!d) {
2520                                 free(s);
2521                                 r = -ENOMEM;
2522                                 goto finish;
2523                         }
2524
2525                         bind_mounts[h++] = (BindMount) {
2526                                 .source = s,
2527                                 .destination = d,
2528                                 .read_only = false,
2529                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2530                                 .recursive = true,
2531                                 .ignore_enoent = false,
2532                         };
2533                 }
2534         }
2535
2536         assert(h == n);
2537
2538         *ret_bind_mounts = bind_mounts;
2539         *ret_n_bind_mounts = n;
2540         *ret_empty_directories = TAKE_PTR(empty_directories);
2541
2542         return (int) n;
2543
2544 finish:
2545         bind_mount_free_many(bind_mounts, h);
2546         return r;
2547 }
2548
2549 static bool insist_on_sandboxing(
2550                 const ExecContext *context,
2551                 const char *root_dir,
2552                 const char *root_image,
2553                 const BindMount *bind_mounts,
2554                 size_t n_bind_mounts) {
2555
2556         size_t i;
2557
2558         assert(context);
2559         assert(n_bind_mounts == 0 || bind_mounts);
2560
2561         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2562          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2563          * rearrange stuff in a way we cannot ignore gracefully. */
2564
2565         if (context->n_temporary_filesystems > 0)
2566                 return true;
2567
2568         if (root_dir || root_image)
2569                 return true;
2570
2571         if (context->dynamic_user)
2572                 return true;
2573
2574         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2575          * essential. */
2576         for (i = 0; i < n_bind_mounts; i++)
2577                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2578                         return true;
2579
2580         if (context->log_namespace)
2581                 return true;
2582
2583         return false;
2584 }
2585
2586 static int apply_mount_namespace(
2587                 const Unit *u,
2588                 const ExecCommand *command,
2589                 const ExecContext *context,
2590                 const ExecParameters *params,
2591                 const ExecRuntime *runtime,
2592                 char **error_path) {
2593
2594         _cleanup_strv_free_ char **empty_directories = NULL;
2595         char *tmp = NULL, *var = NULL;
2596         const char *root_dir = NULL, *root_image = NULL;
2597         NamespaceInfo ns_info;
2598         bool needs_sandboxing;
2599         BindMount *bind_mounts = NULL;
2600         size_t n_bind_mounts = 0;
2601         int r;
2602
2603         assert(context);
2604
2605         if (params->flags & EXEC_APPLY_CHROOT) {
2606                 root_image = context->root_image;
2607
2608                 if (!root_image)
2609                         root_dir = context->root_directory;
2610         }
2611
2612         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2613         if (r < 0)
2614                 return r;
2615
2616         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2617         if (needs_sandboxing) {
2618                 /* The runtime struct only contains the parent of the private /tmp,
2619                  * which is non-accessible to world users. Inside of it there's a /tmp
2620                  * that is sticky, and that's the one we want to use here. */
2621
2622                 if (context->private_tmp && runtime) {
2623                         if (runtime->tmp_dir)
2624                                 tmp = strjoina(runtime->tmp_dir, "/tmp");
2625                         if (runtime->var_tmp_dir)
2626                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
2627                 }
2628
2629                 ns_info = (NamespaceInfo) {
2630                         .ignore_protect_paths = false,
2631                         .private_dev = context->private_devices,
2632                         .protect_control_groups = context->protect_control_groups,
2633                         .protect_kernel_tunables = context->protect_kernel_tunables,
2634                         .protect_kernel_modules = context->protect_kernel_modules,
2635                         .protect_kernel_logs = context->protect_kernel_logs,
2636                         .protect_hostname = context->protect_hostname,
2637                         .mount_apivfs = context->mount_apivfs,
2638                         .private_mounts = context->private_mounts,
2639                 };
2640         } else if (!context->dynamic_user && root_dir)
2641                 /*
2642                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2643                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2644                  * fail if we are enable to apply the sandbox inside the mount namespace.
2645                  */
2646                 ns_info = (NamespaceInfo) {
2647                         .ignore_protect_paths = true,
2648                 };
2649         else
2650                 ns_info = (NamespaceInfo) {};
2651
2652         if (context->mount_flags == MS_SHARED)
2653                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2654
2655         r = setup_namespace(root_dir, root_image,
2656                             &ns_info, context->read_write_paths,
2657                             needs_sandboxing ? context->read_only_paths : NULL,
2658                             needs_sandboxing ? context->inaccessible_paths : NULL,
2659                             empty_directories,
2660                             bind_mounts,
2661                             n_bind_mounts,
2662                             context->temporary_filesystems,
2663                             context->n_temporary_filesystems,
2664                             tmp,
2665                             var,
2666                             context->log_namespace,
2667                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2668                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2669                             context->mount_flags,
2670                             context->root_hash, context->root_hash_size, context->root_hash_path,
2671                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
2672                             context->root_verity,
2673                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2674                             error_path);
2675
2676         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2677          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2678          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2679          * completely different execution environment. */
2680         if (r == -ENOANO) {
2681                 if (insist_on_sandboxing(
2682                                     context,
2683                                     root_dir, root_image,
2684                                     bind_mounts,
2685                                     n_bind_mounts)) {
2686                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2687                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2688                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2689
2690                         r = -EOPNOTSUPP;
2691                 } else {
2692                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2693                         r = 0;
2694                 }
2695         }
2696
2697         bind_mount_free_many(bind_mounts, n_bind_mounts);
2698         return r;
2699 }
2700
2701 static int apply_working_directory(
2702                 const ExecContext *context,
2703                 const ExecParameters *params,
2704                 const char *home,
2705                 int *exit_status) {
2706
2707         const char *d, *wd;
2708
2709         assert(context);
2710         assert(exit_status);
2711
2712         if (context->working_directory_home) {
2713
2714                 if (!home) {
2715                         *exit_status = EXIT_CHDIR;
2716                         return -ENXIO;
2717                 }
2718
2719                 wd = home;
2720
2721         } else if (context->working_directory)
2722                 wd = context->working_directory;
2723         else
2724                 wd = "/";
2725
2726         if (params->flags & EXEC_APPLY_CHROOT)
2727                 d = wd;
2728         else
2729                 d = prefix_roota(context->root_directory, wd);
2730
2731         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2732                 *exit_status = EXIT_CHDIR;
2733                 return -errno;
2734         }
2735
2736         return 0;
2737 }
2738
2739 static int apply_root_directory(
2740                 const ExecContext *context,
2741                 const ExecParameters *params,
2742                 const bool needs_mount_ns,
2743                 int *exit_status) {
2744
2745         assert(context);
2746         assert(exit_status);
2747
2748         if (params->flags & EXEC_APPLY_CHROOT) {
2749                 if (!needs_mount_ns && context->root_directory)
2750                         if (chroot(context->root_directory) < 0) {
2751                                 *exit_status = EXIT_CHROOT;
2752                                 return -errno;
2753                         }
2754         }
2755
2756         return 0;
2757 }
2758
2759 static int setup_keyring(
2760                 const Unit *u,
2761                 const ExecContext *context,
2762                 const ExecParameters *p,
2763                 uid_t uid, gid_t gid) {
2764
2765         key_serial_t keyring;
2766         int r = 0;
2767         uid_t saved_uid;
2768         gid_t saved_gid;
2769
2770         assert(u);
2771         assert(context);
2772         assert(p);
2773
2774         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2775          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2776          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2777          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2778          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2779          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2780
2781         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2782                 return 0;
2783
2784         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2785          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2786          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2787          * & group is just as nasty as acquiring a reference to the user keyring. */
2788
2789         saved_uid = getuid();
2790         saved_gid = getgid();
2791
2792         if (gid_is_valid(gid) && gid != saved_gid) {
2793                 if (setregid(gid, -1) < 0)
2794                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2795         }
2796
2797         if (uid_is_valid(uid) && uid != saved_uid) {
2798                 if (setreuid(uid, -1) < 0) {
2799                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2800                         goto out;
2801                 }
2802         }
2803
2804         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2805         if (keyring == -1) {
2806                 if (errno == ENOSYS)
2807                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2808                 else if (IN_SET(errno, EACCES, EPERM))
2809                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2810                 else if (errno == EDQUOT)
2811                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2812                 else
2813                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2814
2815                 goto out;
2816         }
2817
2818         /* When requested link the user keyring into the session keyring. */
2819         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2820
2821                 if (keyctl(KEYCTL_LINK,
2822                            KEY_SPEC_USER_KEYRING,
2823                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2824                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2825                         goto out;
2826                 }
2827         }
2828
2829         /* Restore uid/gid back */
2830         if (uid_is_valid(uid) && uid != saved_uid) {
2831                 if (setreuid(saved_uid, -1) < 0) {
2832                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2833                         goto out;
2834                 }
2835         }
2836
2837         if (gid_is_valid(gid) && gid != saved_gid) {
2838                 if (setregid(saved_gid, -1) < 0)
2839                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2840         }
2841
2842         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2843         if (!sd_id128_is_null(u->invocation_id)) {
2844                 key_serial_t key;
2845
2846                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2847                 if (key == -1)
2848                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2849                 else {
2850                         if (keyctl(KEYCTL_SETPERM, key,
2851                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2852                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2853                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2854                 }
2855         }
2856
2857 out:
2858         /* Revert back uid & gid for the last time, and exit */
2859         /* no extra logging, as only the first already reported error matters */
2860         if (getuid() != saved_uid)
2861                 (void) setreuid(saved_uid, -1);
2862
2863         if (getgid() != saved_gid)
2864                 (void) setregid(saved_gid, -1);
2865
2866         return r;
2867 }
2868
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to array, first pair[0],
 * then pair[1], and advances *n by the number of entries written. The caller has to make sure that
 * array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2879
2880 static int close_remaining_fds(
2881                 const ExecParameters *params,
2882                 const ExecRuntime *runtime,
2883                 const DynamicCreds *dcreds,
2884                 int user_lookup_fd,
2885                 int socket_fd,
2886                 int exec_fd,
2887                 const int *fds, size_t n_fds) {
2888
2889         size_t n_dont_close = 0;
2890         int dont_close[n_fds + 12];
2891
2892         assert(params);
2893
2894         if (params->stdin_fd >= 0)
2895                 dont_close[n_dont_close++] = params->stdin_fd;
2896         if (params->stdout_fd >= 0)
2897                 dont_close[n_dont_close++] = params->stdout_fd;
2898         if (params->stderr_fd >= 0)
2899                 dont_close[n_dont_close++] = params->stderr_fd;
2900
2901         if (socket_fd >= 0)
2902                 dont_close[n_dont_close++] = socket_fd;
2903         if (exec_fd >= 0)
2904                 dont_close[n_dont_close++] = exec_fd;
2905         if (n_fds > 0) {
2906                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2907                 n_dont_close += n_fds;
2908         }
2909
2910         if (runtime)
2911                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2912
2913         if (dcreds) {
2914                 if (dcreds->user)
2915                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2916                 if (dcreds->group)
2917                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2918         }
2919
2920         if (user_lookup_fd >= 0)
2921                 dont_close[n_dont_close++] = user_lookup_fd;
2922
2923         return close_all_fds(dont_close, n_dont_close);
2924 }
2925
2926 static int send_user_lookup(
2927                 Unit *unit,
2928                 int user_lookup_fd,
2929                 uid_t uid,
2930                 gid_t gid) {
2931
2932         assert(unit);
2933
2934         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2935          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2936          * specified. */
2937
2938         if (user_lookup_fd < 0)
2939                 return 0;
2940
2941         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2942                 return 0;
2943
2944         if (writev(user_lookup_fd,
2945                (struct iovec[]) {
2946                            IOVEC_INIT(&uid, sizeof(uid)),
2947                            IOVEC_INIT(&gid, sizeof(gid)),
2948                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2949                 return -errno;
2950
2951         return 0;
2952 }
2953
2954 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2955         int r;
2956
2957         assert(c);
2958         assert(home);
2959         assert(buf);
2960
2961         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2962
2963         if (*home)
2964                 return 0;
2965
2966         if (!c->working_directory_home)
2967                 return 0;
2968
2969         r = get_home_dir(buf);
2970         if (r < 0)
2971                 return r;
2972
2973         *home = *buf;
2974         return 1;
2975 }
2976
2977 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2978         _cleanup_strv_free_ char ** list = NULL;
2979         ExecDirectoryType t;
2980         int r;
2981
2982         assert(c);
2983         assert(p);
2984         assert(ret);
2985
2986         assert(c->dynamic_user);
2987
2988         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2989          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2990          * directories. */
2991
2992         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2993                 char **i;
2994
2995                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2996                         continue;
2997
2998                 if (!p->prefix[t])
2999                         continue;
3000
3001                 STRV_FOREACH(i, c->directories[t].paths) {
3002                         char *e;
3003
3004                         if (exec_directory_is_private(c, t))
3005                                 e = path_join(p->prefix[t], "private", *i);
3006                         else
3007                                 e = path_join(p->prefix[t], *i);
3008                         if (!e)
3009                                 return -ENOMEM;
3010
3011                         r = strv_consume(&list, e);
3012                         if (r < 0)
3013                                 return r;
3014                 }
3015         }
3016
3017         *ret = TAKE_PTR(list);
3018
3019         return 0;
3020 }
3021
3022 static char *exec_command_line(char **argv);
3023
3024 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3025         bool using_subcgroup;
3026         char *p;
3027
3028         assert(params);
3029         assert(ret);
3030
3031         if (!params->cgroup_path)
3032                 return -EINVAL;
3033
3034         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3035          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3036          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3037          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3038          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3039          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3040          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3041          * flag, which is only passed for the former statements, not for the latter. */
3042
3043         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3044         if (using_subcgroup)
3045                 p = path_join(params->cgroup_path, ".control");
3046         else
3047                 p = strdup(params->cgroup_path);
3048         if (!p)
3049                 return -ENOMEM;
3050
3051         *ret = p;
3052         return using_subcgroup;
3053 }
3054
3055 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3056         _cleanup_(cpu_set_reset) CPUSet s = {};
3057         int r;
3058
3059         assert(c);
3060         assert(ret);
3061
3062         if (!c->numa_policy.nodes.set) {
3063                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3064                 return 0;
3065         }
3066
3067         r = numa_to_cpu_set(&c->numa_policy, &s);
3068         if (r < 0)
3069                 return r;
3070
3071         cpu_set_reset(ret);
3072
3073         return cpu_set_add_all(ret, &s);
3074 }
3075
3076 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3077         assert(c);
3078
3079         return c->cpu_affinity_from_numa;
3080 }
3081
3082 static int exec_child(
3083                 Unit *unit,
3084                 const ExecCommand *command,
3085                 const ExecContext *context,
3086                 const ExecParameters *params,
3087                 ExecRuntime *runtime,
3088                 DynamicCreds *dcreds,
3089                 int socket_fd,
3090                 const int named_iofds[static 3],
3091                 int *fds,
3092                 size_t n_socket_fds,
3093                 size_t n_storage_fds,
3094                 char **files_env,
3095                 int user_lookup_fd,
3096                 int *exit_status) {
3097
3098         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3099         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3100         _cleanup_free_ gid_t *supplementary_gids = NULL;
3101         const char *username = NULL, *groupname = NULL;
3102         _cleanup_free_ char *home_buffer = NULL;
3103         const char *home = NULL, *shell = NULL;
3104         char **final_argv = NULL;
3105         dev_t journal_stream_dev = 0;
3106         ino_t journal_stream_ino = 0;
3107         bool userns_set_up = false;
3108         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3109                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3110                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3111                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3112 #if HAVE_SELINUX
3113         _cleanup_free_ char *mac_selinux_context_net = NULL;
3114         bool use_selinux = false;
3115 #endif
3116 #if ENABLE_SMACK
3117         bool use_smack = false;
3118 #endif
3119 #if HAVE_APPARMOR
3120         bool use_apparmor = false;
3121 #endif
3122         uid_t saved_uid = getuid();
3123         gid_t saved_gid = getgid();
3124         uid_t uid = UID_INVALID;
3125         gid_t gid = GID_INVALID;
3126         size_t n_fds;
3127         ExecDirectoryType dt;
3128         int secure_bits;
3129         _cleanup_free_ gid_t *gids_after_pam = NULL;
3130         int ngids_after_pam = 0;
3131
3132         assert(unit);
3133         assert(command);
3134         assert(context);
3135         assert(params);
3136         assert(exit_status);
3137
3138         rename_process_from_path(command->path);
3139
3140         /* We reset exactly these signals, since they are the
3141          * only ones we set to SIG_IGN in the main daemon. All
3142          * others we leave untouched because we set them to
3143          * SIG_DFL or a valid handler initially, both of which
3144          * will be demoted to SIG_DFL. */
3145         (void) default_signals(SIGNALS_CRASH_HANDLER,
3146                                SIGNALS_IGNORE, -1);
3147
3148         if (context->ignore_sigpipe)
3149                 (void) ignore_signals(SIGPIPE, -1);
3150
3151         r = reset_signal_mask();
3152         if (r < 0) {
3153                 *exit_status = EXIT_SIGNAL_MASK;
3154                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3155         }
3156
3157         if (params->idle_pipe)
3158                 do_idle_pipe_dance(params->idle_pipe);
3159
3160         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3161          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3162          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3163          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3164
3165         log_forget_fds();
3166         log_set_open_when_needed(true);
3167
3168         /* In case anything used libc syslog(), close this here, too */
3169         closelog();
3170
3171         n_fds = n_socket_fds + n_storage_fds;
3172         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3173         if (r < 0) {
3174                 *exit_status = EXIT_FDS;
3175                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3176         }
3177
3178         if (!context->same_pgrp)
3179                 if (setsid() < 0) {
3180                         *exit_status = EXIT_SETSID;
3181                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3182                 }
3183
3184         exec_context_tty_reset(context, params);
3185
3186         if (unit_shall_confirm_spawn(unit)) {
3187                 const char *vc = params->confirm_spawn;
3188                 _cleanup_free_ char *cmdline = NULL;
3189
3190                 cmdline = exec_command_line(command->argv);
3191                 if (!cmdline) {
3192                         *exit_status = EXIT_MEMORY;
3193                         return log_oom();
3194                 }
3195
3196                 r = ask_for_confirmation(vc, unit, cmdline);
3197                 if (r != CONFIRM_EXECUTE) {
3198                         if (r == CONFIRM_PRETEND_SUCCESS) {
3199                                 *exit_status = EXIT_SUCCESS;
3200                                 return 0;
3201                         }
3202                         *exit_status = EXIT_CONFIRM;
3203                         log_unit_error(unit, "Execution cancelled by the user");
3204                         return -ECANCELED;
3205                 }
3206         }
3207
3208         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3209          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3210          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3211          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3212          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3213         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3214             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3215                 *exit_status = EXIT_MEMORY;
3216                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3217         }
3218
3219         if (context->dynamic_user && dcreds) {
3220                 _cleanup_strv_free_ char **suggested_paths = NULL;
3221
3222                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3223                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3224                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3225                         *exit_status = EXIT_USER;
3226                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3227                 }
3228
3229                 r = compile_suggested_paths(context, params, &suggested_paths);
3230                 if (r < 0) {
3231                         *exit_status = EXIT_MEMORY;
3232                         return log_oom();
3233                 }
3234
3235                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3236                 if (r < 0) {
3237                         *exit_status = EXIT_USER;
3238                         if (r == -EILSEQ) {
3239                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3240                                 return -EOPNOTSUPP;
3241                         }
3242                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3243                 }
3244
3245                 if (!uid_is_valid(uid)) {
3246                         *exit_status = EXIT_USER;
3247                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3248                         return -ESRCH;
3249                 }
3250
3251                 if (!gid_is_valid(gid)) {
3252                         *exit_status = EXIT_USER;
3253                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3254                         return -ESRCH;
3255                 }
3256
3257                 if (dcreds->user)
3258                         username = dcreds->user->name;
3259
3260         } else {
3261                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3262                 if (r < 0) {
3263                         *exit_status = EXIT_USER;
3264                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3265                 }
3266
3267                 r = get_fixed_group(context, &groupname, &gid);
3268                 if (r < 0) {
3269                         *exit_status = EXIT_GROUP;
3270                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3271                 }
3272         }
3273
3274         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3275         r = get_supplementary_groups(context, username, groupname, gid,
3276                                      &supplementary_gids, &ngids);
3277         if (r < 0) {
3278                 *exit_status = EXIT_GROUP;
3279                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3280         }
3281
3282         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3283         if (r < 0) {
3284                 *exit_status = EXIT_USER;
3285                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3286         }
3287
3288         user_lookup_fd = safe_close(user_lookup_fd);
3289
3290         r = acquire_home(context, uid, &home, &home_buffer);
3291         if (r < 0) {
3292                 *exit_status = EXIT_CHDIR;
3293                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3294         }
3295
3296         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3297          * must be sure to drop O_NONBLOCK */
3298         if (socket_fd >= 0)
3299                 (void) fd_nonblock(socket_fd, false);
3300
3301         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3302          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3303         if (params->cgroup_path) {
3304                 _cleanup_free_ char *p = NULL;
3305
3306                 r = exec_parameters_get_cgroup_path(params, &p);
3307                 if (r < 0) {
3308                         *exit_status = EXIT_CGROUP;
3309                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3310                 }
3311
3312                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3313                 if (r < 0) {
3314                         *exit_status = EXIT_CGROUP;
3315                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3316                 }
3317         }
3318
3319         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3320                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3321                 if (r < 0) {
3322                         *exit_status = EXIT_NETWORK;
3323                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3324                 }
3325         }
3326
3327         r = setup_input(context, params, socket_fd, named_iofds);
3328         if (r < 0) {
3329                 *exit_status = EXIT_STDIN;
3330                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3331         }
3332
3333         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3334         if (r < 0) {
3335                 *exit_status = EXIT_STDOUT;
3336                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3337         }
3338
3339         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3340         if (r < 0) {
3341                 *exit_status = EXIT_STDERR;
3342                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3343         }
3344
3345         if (context->oom_score_adjust_set) {
3346                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3347                  * prohibit write access to this file, and we shouldn't trip up over that. */
3348                 r = set_oom_score_adjust(context->oom_score_adjust);
3349                 if (IN_SET(r, -EPERM, -EACCES))
3350                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3351                 else if (r < 0) {
3352                         *exit_status = EXIT_OOM_ADJUST;
3353                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3354                 }
3355         }
3356
3357         if (context->coredump_filter_set) {
3358                 r = set_coredump_filter(context->coredump_filter);
3359                 if (ERRNO_IS_PRIVILEGE(r))
3360                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3361                 else if (r < 0)
3362                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3363         }
3364
3365         if (context->nice_set) {
3366                 r = setpriority_closest(context->nice);
3367                 if (r < 0)
3368                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3369         }
3370
3371         if (context->cpu_sched_set) {
3372                 struct sched_param param = {
3373                         .sched_priority = context->cpu_sched_priority,
3374                 };
3375
3376                 r = sched_setscheduler(0,
3377                                        context->cpu_sched_policy |
3378                                        (context->cpu_sched_reset_on_fork ?
3379                                         SCHED_RESET_ON_FORK : 0),
3380                                        &param);
3381                 if (r < 0) {
3382                         *exit_status = EXIT_SETSCHEDULER;
3383                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3384                 }
3385         }
3386
3387         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3388                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3389                 const CPUSet *cpu_set;
3390
3391                 if (context->cpu_affinity_from_numa) {
3392                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3393                         if (r < 0) {
3394                                 *exit_status = EXIT_CPUAFFINITY;
3395                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3396                         }
3397
3398                         cpu_set = &converted_cpu_set;
3399                 } else
3400                         cpu_set = &context->cpu_set;
3401
3402                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3403                         *exit_status = EXIT_CPUAFFINITY;
3404                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3405                 }
3406         }
3407
3408         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3409                 r = apply_numa_policy(&context->numa_policy);
3410                 if (r == -EOPNOTSUPP)
3411                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3412                 else if (r < 0) {
3413                         *exit_status = EXIT_NUMA_POLICY;
3414                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3415                 }
3416         }
3417
3418         if (context->ioprio_set)
3419                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3420                         *exit_status = EXIT_IOPRIO;
3421                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3422                 }
3423
3424         if (context->timer_slack_nsec != NSEC_INFINITY)
3425                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3426                         *exit_status = EXIT_TIMERSLACK;
3427                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3428                 }
3429
3430         if (context->personality != PERSONALITY_INVALID) {
3431                 r = safe_personality(context->personality);
3432                 if (r < 0) {
3433                         *exit_status = EXIT_PERSONALITY;
3434                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3435                 }
3436         }
3437
3438         if (context->utmp_id)
3439                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3440                                       context->tty_path,
3441                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3442                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3443                                       USER_PROCESS,
3444                                       username);
3445
3446         if (uid_is_valid(uid)) {
3447                 r = chown_terminal(STDIN_FILENO, uid);
3448                 if (r < 0) {
3449                         *exit_status = EXIT_STDIN;
3450                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3451                 }
3452         }
3453
3454         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3455          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3456          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3457          * touch a single hierarchy too. */
3458         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3459                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3460                 if (r < 0) {
3461                         *exit_status = EXIT_CGROUP;
3462                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3463                 }
3464         }
3465
3466         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3467                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3468                 if (r < 0)
3469                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3470         }
3471
3472         r = build_environment(
3473                         unit,
3474                         context,
3475                         params,
3476                         n_fds,
3477                         home,
3478                         username,
3479                         shell,
3480                         journal_stream_dev,
3481                         journal_stream_ino,
3482                         &our_env);
3483         if (r < 0) {
3484                 *exit_status = EXIT_MEMORY;
3485                 return log_oom();
3486         }
3487
3488         r = build_pass_environment(context, &pass_env);
3489         if (r < 0) {
3490                 *exit_status = EXIT_MEMORY;
3491                 return log_oom();
3492         }
3493
3494         accum_env = strv_env_merge(5,
3495                                    params->environment,
3496                                    our_env,
3497                                    pass_env,
3498                                    context->environment,
3499                                    files_env);
3500         if (!accum_env) {
3501                 *exit_status = EXIT_MEMORY;
3502                 return log_oom();
3503         }
3504         accum_env = strv_env_clean(accum_env);
3505
3506         (void) umask(context->umask);
3507
3508         r = setup_keyring(unit, context, params, uid, gid);
3509         if (r < 0) {
3510                 *exit_status = EXIT_KEYRING;
3511                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3512         }
3513
3514         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3515         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3516
3517         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3518         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3519
3520         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3521         if (needs_ambient_hack)
3522                 needs_setuid = false;
3523         else
3524                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3525
3526         if (needs_sandboxing) {
3527                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3528                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3529                  * impacting our own code paths. */
3530
3531 #if HAVE_SELINUX
3532                 use_selinux = mac_selinux_use();
3533 #endif
3534 #if ENABLE_SMACK
3535                 use_smack = mac_smack_use();
3536 #endif
3537 #if HAVE_APPARMOR
3538                 use_apparmor = mac_apparmor_use();
3539 #endif
3540         }
3541
3542         if (needs_sandboxing) {
3543                 int which_failed;
3544
3545                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3546                  * is set here. (See below.) */
3547
3548                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3549                 if (r < 0) {
3550                         *exit_status = EXIT_LIMITS;
3551                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3552                 }
3553         }
3554
3555         if (needs_setuid) {
3556
3557                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3558                  * wins here. (See above.) */
3559
3560                 if (context->pam_name && username) {
3561                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3562                         if (r < 0) {
3563                                 *exit_status = EXIT_PAM;
3564                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3565                         }
3566
3567                         ngids_after_pam = getgroups_alloc(&gids_after_pam);
3568                         if (ngids_after_pam < 0) {
3569                                 *exit_status = EXIT_MEMORY;
3570                                 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3571                         }
3572                 }
3573         }
3574
3575         if (needs_sandboxing) {
3576 #if HAVE_SELINUX
3577                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3578                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3579                         if (r < 0) {
3580                                 *exit_status = EXIT_SELINUX_CONTEXT;
3581                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3582                         }
3583                 }
3584 #endif
3585
3586                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3587                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3588                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3589                 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3590                         userns_set_up = true;
3591                         r = setup_private_users(saved_uid, saved_gid, uid, gid);
3592                         if (r < 0) {
3593                                 *exit_status = EXIT_USER;
3594                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3595                         }
3596                 }
3597         }
3598
3599         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3600
3601                 if (ns_type_supported(NAMESPACE_NET)) {
3602                         r = setup_netns(runtime->netns_storage_socket);
3603                         if (r == -EPERM)
3604                                 log_unit_warning_errno(unit, r,
3605                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3606                         else if (r < 0) {
3607                                 *exit_status = EXIT_NETWORK;
3608                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3609                         }
3610                 } else if (context->network_namespace_path) {
3611                         *exit_status = EXIT_NETWORK;
3612                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3613                                                     "NetworkNamespacePath= is not supported, refusing.");
3614                 } else
3615                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3616         }
3617
3618         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3619         if (needs_mount_namespace) {
3620                 _cleanup_free_ char *error_path = NULL;
3621
3622                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3623                 if (r < 0) {
3624                         *exit_status = EXIT_NAMESPACE;
3625                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3626                                                     error_path ? ": " : "", strempty(error_path));
3627                 }
3628         }
3629
3630         if (needs_sandboxing) {
3631                 r = apply_protect_hostname(unit, context, exit_status);
3632                 if (r < 0)
3633                         return r;
3634         }
3635
3636         /* Drop groups as early as possible.
3637          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3638          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3639         if (needs_setuid) {
3640                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3641                 int ngids_to_enforce = 0;
3642
3643                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3644                                                    ngids,
3645                                                    gids_after_pam,
3646                                                    ngids_after_pam,
3647                                                    &gids_to_enforce);
3648                 if (ngids_to_enforce < 0) {
3649                         *exit_status = EXIT_MEMORY;
3650                         return log_unit_error_errno(unit,
3651                                                     ngids_to_enforce,
3652                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
3653                 }
3654
3655                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3656                 if (r < 0) {
3657                         *exit_status = EXIT_GROUP;
3658                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3659                 }
3660         }
3661
3662         /* If the user namespace was not set up above, try to do it now.
3663          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3664          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3665          * case of mount namespaces being less privileged when the mount point list is copied from a
3666          * different user namespace). */
3667
3668         if (needs_sandboxing && context->private_users && !userns_set_up) {
3669                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3670                 if (r < 0) {
3671                         *exit_status = EXIT_USER;
3672                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3673                 }
3674         }
3675
3676         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3677          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3678          * however if we have it as we want to keep it open until the final execve(). */
3679
3680         if (params->exec_fd >= 0) {
3681                 exec_fd = params->exec_fd;
3682
3683                 if (exec_fd < 3 + (int) n_fds) {
3684                         int moved_fd;
3685
3686                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3687                          * process we are about to execute. */
3688
3689                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3690                         if (moved_fd < 0) {
3691                                 *exit_status = EXIT_FDS;
3692                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3693                         }
3694
3695                         safe_close(exec_fd);
3696                         exec_fd = moved_fd;
3697                 } else {
3698                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3699                         r = fd_cloexec(exec_fd, true);
3700                         if (r < 0) {
3701                                 *exit_status = EXIT_FDS;
3702                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3703                         }
3704                 }
3705
3706                 fds_with_exec_fd = newa(int, n_fds + 1);
3707                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3708                 fds_with_exec_fd[n_fds] = exec_fd;
3709                 n_fds_with_exec_fd = n_fds + 1;
3710         } else {
3711                 fds_with_exec_fd = fds;
3712                 n_fds_with_exec_fd = n_fds;
3713         }
3714
3715         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3716         if (r >= 0)
3717                 r = shift_fds(fds, n_fds);
3718         if (r >= 0)
3719                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3720         if (r < 0) {
3721                 *exit_status = EXIT_FDS;
3722                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3723         }
3724
3725         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3726          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3727          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3728          * came this far. */
3729
3730         secure_bits = context->secure_bits;
3731
3732         if (needs_sandboxing) {
3733                 uint64_t bset;
3734
3735                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3736                  * requested. (Note this is placed after the general resource limit initialization, see
3737                  * above, in order to take precedence.) */
3738                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3739                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3740                                 *exit_status = EXIT_LIMITS;
3741                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3742                         }
3743                 }
3744
3745 #if ENABLE_SMACK
3746                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3747                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3748                 if (use_smack) {
3749                         r = setup_smack(context, command);
3750                         if (r < 0) {
3751                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3752                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3753                         }
3754                 }
3755 #endif
3756
3757                 bset = context->capability_bounding_set;
3758                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3759                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3760                  * instead of us doing that */
3761                 if (needs_ambient_hack)
3762                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3763                                 (UINT64_C(1) << CAP_SETUID) |
3764                                 (UINT64_C(1) << CAP_SETGID);
3765
3766                 if (!cap_test_all(bset)) {
3767                         r = capability_bounding_set_drop(bset, false);
3768                         if (r < 0) {
3769                                 *exit_status = EXIT_CAPABILITIES;
3770                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3771                         }
3772                 }
3773
3774                 /* This is done before enforce_user, but ambient set
3775                  * does not survive over setresuid() if keep_caps is not set. */
3776                 if (!needs_ambient_hack) {
3777                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3778                         if (r < 0) {
3779                                 *exit_status = EXIT_CAPABILITIES;
3780                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3781                         }
3782                 }
3783         }
3784
3785         /* chroot to root directory first, before we lose the ability to chroot */
3786         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3787         if (r < 0)
3788                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3789
3790         if (needs_setuid) {
3791                 if (uid_is_valid(uid)) {
3792                         r = enforce_user(context, uid);
3793                         if (r < 0) {
3794                                 *exit_status = EXIT_USER;
3795                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3796                         }
3797
3798                         if (!needs_ambient_hack &&
3799                             context->capability_ambient_set != 0) {
3800
3801                                 /* Fix the ambient capabilities after user change. */
3802                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3803                                 if (r < 0) {
3804                                         *exit_status = EXIT_CAPABILITIES;
3805                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3806                                 }
3807
3808                                 /* If we were asked to change user and ambient capabilities
3809                                  * were requested, we had to add keep-caps to the securebits
3810                                  * so that we would maintain the inherited capability set
3811                                  * through the setresuid(). Make sure that the bit is added
3812                                  * also to the context secure_bits so that we don't try to
3813                                  * drop the bit away next. */
3814
3815                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3816                         }
3817                 }
3818         }
3819
3820         /* Apply working directory here, because the working directory might be on NFS and only the user running
3821          * this service might have the correct privilege to change to the working directory */
3822         r = apply_working_directory(context, params, home, exit_status);
3823         if (r < 0)
3824                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3825
3826         if (needs_sandboxing) {
3827                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3828                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3829                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3830                  * are restricted. */
3831
3832 #if HAVE_SELINUX
3833                 if (use_selinux) {
3834                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3835
3836                         if (exec_context) {
3837                                 r = setexeccon(exec_context);
3838                                 if (r < 0) {
3839                                         *exit_status = EXIT_SELINUX_CONTEXT;
3840                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3841                                 }
3842                         }
3843                 }
3844 #endif
3845
3846 #if HAVE_APPARMOR
3847                 if (use_apparmor && context->apparmor_profile) {
3848                         r = aa_change_onexec(context->apparmor_profile);
3849                         if (r < 0 && !context->apparmor_profile_ignore) {
3850                                 *exit_status = EXIT_APPARMOR_PROFILE;
3851                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3852                         }
3853                 }
3854 #endif
3855
3856                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3857                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3858                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3859                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3860                                 *exit_status = EXIT_SECUREBITS;
3861                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3862                         }
3863
3864                 if (context_has_no_new_privileges(context))
3865                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3866                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3867                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3868                         }
3869
3870 #if HAVE_SECCOMP
3871                 r = apply_address_families(unit, context);
3872                 if (r < 0) {
3873                         *exit_status = EXIT_ADDRESS_FAMILIES;
3874                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3875                 }
3876
3877                 r = apply_memory_deny_write_execute(unit, context);
3878                 if (r < 0) {
3879                         *exit_status = EXIT_SECCOMP;
3880                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3881                 }
3882
3883                 r = apply_restrict_realtime(unit, context);
3884                 if (r < 0) {
3885                         *exit_status = EXIT_SECCOMP;
3886                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3887                 }
3888
3889                 r = apply_restrict_suid_sgid(unit, context);
3890                 if (r < 0) {
3891                         *exit_status = EXIT_SECCOMP;
3892                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3893                 }
3894
3895                 r = apply_restrict_namespaces(unit, context);
3896                 if (r < 0) {
3897                         *exit_status = EXIT_SECCOMP;
3898                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3899                 }
3900
3901                 r = apply_protect_sysctl(unit, context);
3902                 if (r < 0) {
3903                         *exit_status = EXIT_SECCOMP;
3904                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3905                 }
3906
3907                 r = apply_protect_kernel_modules(unit, context);
3908                 if (r < 0) {
3909                         *exit_status = EXIT_SECCOMP;
3910                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3911                 }
3912
3913                 r = apply_protect_kernel_logs(unit, context);
3914                 if (r < 0) {
3915                         *exit_status = EXIT_SECCOMP;
3916                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3917                 }
3918
3919                 r = apply_protect_clock(unit, context);
3920                 if (r < 0) {
3921                         *exit_status = EXIT_SECCOMP;
3922                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3923                 }
3924
3925                 r = apply_private_devices(unit, context);
3926                 if (r < 0) {
3927                         *exit_status = EXIT_SECCOMP;
3928                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3929                 }
3930
3931                 r = apply_syscall_archs(unit, context);
3932                 if (r < 0) {
3933                         *exit_status = EXIT_SECCOMP;
3934                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3935                 }
3936
3937                 r = apply_lock_personality(unit, context);
3938                 if (r < 0) {
3939                         *exit_status = EXIT_SECCOMP;
3940                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3941                 }
3942
3943                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3944                  * by the filter as little as possible. */
3945                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3946                 if (r < 0) {
3947                         *exit_status = EXIT_SECCOMP;
3948                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3949                 }
3950 #endif
3951         }
3952
3953         if (!strv_isempty(context->unset_environment)) {
3954                 char **ee = NULL;
3955
3956                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3957                 if (!ee) {
3958                         *exit_status = EXIT_MEMORY;
3959                         return log_oom();
3960                 }
3961
3962                 strv_free_and_replace(accum_env, ee);
3963         }
3964
3965         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3966                 replaced_argv = replace_env_argv(command->argv, accum_env);
3967                 if (!replaced_argv) {
3968                         *exit_status = EXIT_MEMORY;
3969                         return log_oom();
3970                 }
3971                 final_argv = replaced_argv;
3972         } else
3973                 final_argv = command->argv;
3974
3975         if (DEBUG_LOGGING) {
3976                 _cleanup_free_ char *line;
3977
3978                 line = exec_command_line(final_argv);
3979                 if (line)
3980                         log_struct(LOG_DEBUG,
3981                                    "EXECUTABLE=%s", command->path,
3982                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3983                                    LOG_UNIT_ID(unit),
3984                                    LOG_UNIT_INVOCATION_ID(unit));
3985         }
3986
3987         if (exec_fd >= 0) {
3988                 uint8_t hot = 1;
3989
3990                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3991                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3992
3993                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3994                         *exit_status = EXIT_EXEC;
3995                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3996                 }
3997         }
3998
3999         execve(command->path, final_argv, accum_env);
4000         r = -errno;
4001
4002         if (exec_fd >= 0) {
4003                 uint8_t hot = 0;
4004
4005                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4006                  * that POLLHUP on it no longer means execve() succeeded. */
4007
4008                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4009                         *exit_status = EXIT_EXEC;
4010                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4011                 }
4012         }
4013
4014         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4015                 log_struct_errno(LOG_INFO, r,
4016                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4017                                  LOG_UNIT_ID(unit),
4018                                  LOG_UNIT_INVOCATION_ID(unit),
4019                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4020                                                   command->path),
4021                                  "EXECUTABLE=%s", command->path);
4022                 return 0;
4023         }
4024
4025         *exit_status = EXIT_EXEC;
4026         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4027 }
4028
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() right below
 * already calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4031
4032 int exec_spawn(Unit *unit,
4033                ExecCommand *command,
4034                const ExecContext *context,
4035                const ExecParameters *params,
4036                ExecRuntime *runtime,
4037                DynamicCreds *dcreds,
4038                pid_t *ret) {
4039
4040         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4041         _cleanup_free_ char *subcgroup_path = NULL;
4042         _cleanup_strv_free_ char **files_env = NULL;
4043         size_t n_storage_fds = 0, n_socket_fds = 0;
4044         _cleanup_free_ char *line = NULL;
4045         pid_t pid;
4046
4047         assert(unit);
4048         assert(command);
4049         assert(context);
4050         assert(ret);
4051         assert(params);
4052         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4053
4054         if (context->std_input == EXEC_INPUT_SOCKET ||
4055             context->std_output == EXEC_OUTPUT_SOCKET ||
4056             context->std_error == EXEC_OUTPUT_SOCKET) {
4057
4058                 if (params->n_socket_fds > 1) {
4059                         log_unit_error(unit, "Got more than one socket.");
4060                         return -EINVAL;
4061                 }
4062
4063                 if (params->n_socket_fds == 0) {
4064                         log_unit_error(unit, "Got no socket.");
4065                         return -EINVAL;
4066                 }
4067
4068                 socket_fd = params->fds[0];
4069         } else {
4070                 socket_fd = -1;
4071                 fds = params->fds;
4072                 n_socket_fds = params->n_socket_fds;
4073                 n_storage_fds = params->n_storage_fds;
4074         }
4075
4076         r = exec_context_named_iofds(context, params, named_iofds);
4077         if (r < 0)
4078                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4079
4080         r = exec_context_load_environment(unit, context, &files_env);
4081         if (r < 0)
4082                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4083
4084         line = exec_command_line(command->argv);
4085         if (!line)
4086                 return log_oom();
4087
4088         log_struct(LOG_DEBUG,
4089                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4090                    "EXECUTABLE=%s", command->path,
4091                    LOG_UNIT_ID(unit),
4092                    LOG_UNIT_INVOCATION_ID(unit));
4093
4094         if (params->cgroup_path) {
4095                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4096                 if (r < 0)
4097                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4098                 if (r > 0) { /* We are using a child cgroup */
4099                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4100                         if (r < 0)
4101                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4102                 }
4103         }
4104
4105         pid = fork();
4106         if (pid < 0)
4107                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4108
4109         if (pid == 0) {
4110                 int exit_status = EXIT_SUCCESS;
4111
4112                 r = exec_child(unit,
4113                                command,
4114                                context,
4115                                params,
4116                                runtime,
4117                                dcreds,
4118                                socket_fd,
4119                                named_iofds,
4120                                fds,
4121                                n_socket_fds,
4122                                n_storage_fds,
4123                                files_env,
4124                                unit->manager->user_lookup_fds[1],
4125                                &exit_status);
4126
4127                 if (r < 0) {
4128                         const char *status =
4129                                 exit_status_to_string(exit_status,
4130                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4131
4132                         log_struct_errno(LOG_ERR, r,
4133                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4134                                          LOG_UNIT_ID(unit),
4135                                          LOG_UNIT_INVOCATION_ID(unit),
4136                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4137                                                           status, command->path),
4138                                          "EXECUTABLE=%s", command->path);
4139                 }
4140
4141                 _exit(exit_status);
4142         }
4143
4144         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4145
4146         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4147          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4148          * process will be killed too). */
4149         if (subcgroup_path)
4150                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4151
4152         exec_status_start(&command->exec_status, pid);
4153
4154         *ret = pid;
4155         return 0;
4156 }
4157
4158 void exec_context_init(ExecContext *c) {
4159         ExecDirectoryType i;
4160
4161         assert(c);
4162
4163         c->umask = 0022;
4164         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4165         c->cpu_sched_policy = SCHED_OTHER;
4166         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4167         c->syslog_level_prefix = true;
4168         c->ignore_sigpipe = true;
4169         c->timer_slack_nsec = NSEC_INFINITY;
4170         c->personality = PERSONALITY_INVALID;
4171         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4172                 c->directories[i].mode = 0755;
4173         c->timeout_clean_usec = USEC_INFINITY;
4174         c->capability_bounding_set = CAP_ALL;
4175         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4176         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4177         c->log_level_max = -1;
4178         numa_policy_reset(&c->numa_policy);
4179 }
4180
4181 void exec_context_done(ExecContext *c) {
4182         ExecDirectoryType i;
4183         size_t l;
4184
4185         assert(c);
4186
4187         c->environment = strv_free(c->environment);
4188         c->environment_files = strv_free(c->environment_files);
4189         c->pass_environment = strv_free(c->pass_environment);
4190         c->unset_environment = strv_free(c->unset_environment);
4191
4192         rlimit_free_all(c->rlimit);
4193
4194         for (l = 0; l < 3; l++) {
4195                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4196                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4197         }
4198
4199         c->working_directory = mfree(c->working_directory);
4200         c->root_directory = mfree(c->root_directory);
4201         c->root_image = mfree(c->root_image);
4202         c->root_hash = mfree(c->root_hash);
4203         c->root_hash_size = 0;
4204         c->root_hash_path = mfree(c->root_hash_path);
4205         c->root_hash_sig = mfree(c->root_hash_sig);
4206         c->root_hash_sig_size = 0;
4207         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4208         c->root_verity = mfree(c->root_verity);
4209         c->tty_path = mfree(c->tty_path);
4210         c->syslog_identifier = mfree(c->syslog_identifier);
4211         c->user = mfree(c->user);
4212         c->group = mfree(c->group);
4213
4214         c->supplementary_groups = strv_free(c->supplementary_groups);
4215
4216         c->pam_name = mfree(c->pam_name);
4217
4218         c->read_only_paths = strv_free(c->read_only_paths);
4219         c->read_write_paths = strv_free(c->read_write_paths);
4220         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4221
4222         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4223         c->bind_mounts = NULL;
4224         c->n_bind_mounts = 0;
4225         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4226         c->temporary_filesystems = NULL;
4227         c->n_temporary_filesystems = 0;
4228
4229         cpu_set_reset(&c->cpu_set);
4230         numa_policy_reset(&c->numa_policy);
4231
4232         c->utmp_id = mfree(c->utmp_id);
4233         c->selinux_context = mfree(c->selinux_context);
4234         c->apparmor_profile = mfree(c->apparmor_profile);
4235         c->smack_process_label = mfree(c->smack_process_label);
4236
4237         c->syscall_filter = hashmap_free(c->syscall_filter);
4238         c->syscall_archs = set_free(c->syscall_archs);
4239         c->address_families = set_free(c->address_families);
4240
4241         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4242                 c->directories[i].paths = strv_free(c->directories[i].paths);
4243
4244         c->log_level_max = -1;
4245
4246         exec_context_free_log_extra_fields(c);
4247
4248         c->log_ratelimit_interval_usec = 0;
4249         c->log_ratelimit_burst = 0;
4250
4251         c->stdin_data = mfree(c->stdin_data);
4252         c->stdin_data_size = 0;
4253
4254         c->network_namespace_path = mfree(c->network_namespace_path);
4255
4256         c->log_namespace = mfree(c->log_namespace);
4257 }
4258
4259 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4260         char **i;
4261
4262         assert(c);
4263
4264         if (!runtime_prefix)
4265                 return 0;
4266
4267         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4268                 _cleanup_free_ char *p;
4269
4270                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4271                         p = path_join(runtime_prefix, "private", *i);
4272                 else
4273                         p = path_join(runtime_prefix, *i);
4274                 if (!p)
4275                         return -ENOMEM;
4276
4277                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4278                  * service next. */
4279                 (void) rm_rf(p, REMOVE_ROOT);
4280         }
4281
4282         return 0;
4283 }
4284
4285 static void exec_command_done(ExecCommand *c) {
4286         assert(c);
4287
4288         c->path = mfree(c->path);
4289         c->argv = strv_free(c->argv);
4290 }
4291
4292 void exec_command_done_array(ExecCommand *c, size_t n) {
4293         size_t i;
4294
4295         for (i = 0; i < n; i++)
4296                 exec_command_done(c+i);
4297 }
4298
4299 ExecCommand* exec_command_free_list(ExecCommand *c) {
4300         ExecCommand *i;
4301
4302         while ((i = c)) {
4303                 LIST_REMOVE(command, c, i);
4304                 exec_command_done(i);
4305                 free(i);
4306         }
4307
4308         return NULL;
4309 }
4310
4311 void exec_command_free_array(ExecCommand **c, size_t n) {
4312         size_t i;
4313
4314         for (i = 0; i < n; i++)
4315                 c[i] = exec_command_free_list(c[i]);
4316 }
4317
4318 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4319         size_t i;
4320
4321         for (i = 0; i < n; i++)
4322                 exec_status_reset(&c[i].exec_status);
4323 }
4324
4325 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4326         size_t i;
4327
4328         for (i = 0; i < n; i++) {
4329                 ExecCommand *z;
4330
4331                 LIST_FOREACH(command, z, c[i])
4332                         exec_status_reset(&z->exec_status);
4333         }
4334 }
4335
/* Context handed to invalid_env() through its opaque userdata pointer. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* appended to the log message; exec_context_load_environment() stores the path
                           * of the environment file being processed here */
} InvalidEnvInfo;
4340
4341 static void invalid_env(const char *p, void *userdata) {
4342         InvalidEnvInfo *info = userdata;
4343
4344         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4345 }
4346
4347 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4348         assert(c);
4349
4350         switch (fd_index) {
4351
4352         case STDIN_FILENO:
4353                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4354                         return NULL;
4355
4356                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4357
4358         case STDOUT_FILENO:
4359                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4360                         return NULL;
4361
4362                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4363
4364         case STDERR_FILENO:
4365                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4366                         return NULL;
4367
4368                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4369
4370         default:
4371                 return NULL;
4372         }
4373 }
4374
4375 static int exec_context_named_iofds(
4376                 const ExecContext *c,
4377                 const ExecParameters *p,
4378                 int named_iofds[static 3]) {
4379
4380         size_t i, targets;
4381         const char* stdio_fdname[3];
4382         size_t n_fds;
4383
4384         assert(c);
4385         assert(p);
4386         assert(named_iofds);
4387
4388         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4389                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4390                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4391
4392         for (i = 0; i < 3; i++)
4393                 stdio_fdname[i] = exec_context_fdname(c, i);
4394
4395         n_fds = p->n_storage_fds + p->n_socket_fds;
4396
4397         for (i = 0; i < n_fds  && targets > 0; i++)
4398                 if (named_iofds[STDIN_FILENO] < 0 &&
4399                     c->std_input == EXEC_INPUT_NAMED_FD &&
4400                     stdio_fdname[STDIN_FILENO] &&
4401                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4402
4403                         named_iofds[STDIN_FILENO] = p->fds[i];
4404                         targets--;
4405
4406                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4407                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4408                            stdio_fdname[STDOUT_FILENO] &&
4409                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4410
4411                         named_iofds[STDOUT_FILENO] = p->fds[i];
4412                         targets--;
4413
4414                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4415                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4416                            stdio_fdname[STDERR_FILENO] &&
4417                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4418
4419                         named_iofds[STDERR_FILENO] = p->fds[i];
4420                         targets--;
4421                 }
4422
4423         return targets == 0 ? 0 : -ENOENT;
4424 }
4425
4426 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4427         char **i, **r = NULL;
4428
4429         assert(c);
4430         assert(l);
4431
4432         STRV_FOREACH(i, c->environment_files) {
4433                 char *fn;
4434                 int k;
4435                 unsigned n;
4436                 bool ignore = false;
4437                 char **p;
4438                 _cleanup_globfree_ glob_t pglob = {};
4439
4440                 fn = *i;
4441
4442                 if (fn[0] == '-') {
4443                         ignore = true;
4444                         fn++;
4445                 }
4446
4447                 if (!path_is_absolute(fn)) {
4448                         if (ignore)
4449                                 continue;
4450
4451                         strv_free(r);
4452                         return -EINVAL;
4453                 }
4454
4455                 /* Filename supports globbing, take all matching files */
4456                 k = safe_glob(fn, 0, &pglob);
4457                 if (k < 0) {
4458                         if (ignore)
4459                                 continue;
4460
4461                         strv_free(r);
4462                         return k;
4463                 }
4464
4465                 /* When we don't match anything, -ENOENT should be returned */
4466                 assert(pglob.gl_pathc > 0);
4467
4468                 for (n = 0; n < pglob.gl_pathc; n++) {
4469                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4470                         if (k < 0) {
4471                                 if (ignore)
4472                                         continue;
4473
4474                                 strv_free(r);
4475                                 return k;
4476                         }
4477                         /* Log invalid environment variables with filename */
4478                         if (p) {
4479                                 InvalidEnvInfo info = {
4480                                         .unit = unit,
4481                                         .path = pglob.gl_pathv[n]
4482                                 };
4483
4484                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4485                         }
4486
4487                         if (!r)
4488                                 r = p;
4489                         else {
4490                                 char **m;
4491
4492                                 m = strv_env_merge(2, r, p);
4493                                 strv_free(r);
4494                                 strv_free(p);
4495                                 if (!m)
4496                                         return -ENOMEM;
4497
4498                                 r = m;
4499                         }
4500                 }
4501         }
4502
4503         *l = r;
4504
4505         return 0;
4506 }
4507
4508 static bool tty_may_match_dev_console(const char *tty) {
4509         _cleanup_free_ char *resolved = NULL;
4510
4511         if (!tty)
4512                 return true;
4513
4514         tty = skip_dev_prefix(tty);
4515
4516         /* trivial identity? */
4517         if (streq(tty, "console"))
4518                 return true;
4519
4520         if (resolve_dev_console(&resolved) < 0)
4521                 return true; /* if we could not resolve, assume it may */
4522
4523         /* "tty0" means the active VC, so it may be the same sometimes */
4524         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4525 }
4526
4527 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4528         assert(ec);
4529
4530         return ec->tty_reset ||
4531                 ec->tty_vhangup ||
4532                 ec->tty_vt_disallocate ||
4533                 is_terminal_input(ec->std_input) ||
4534                 is_terminal_output(ec->std_output) ||
4535                 is_terminal_output(ec->std_error);
4536 }
4537
4538 bool exec_context_may_touch_console(const ExecContext *ec) {
4539
4540         return exec_context_may_touch_tty(ec) &&
4541                tty_may_match_dev_console(exec_context_tty_path(ec));
4542 }
4543
4544 static void strv_fprintf(FILE *f, char **l) {
4545         char **g;
4546
4547         assert(f);
4548
4549         STRV_FOREACH(g, l)
4550                 fprintf(f, " %s", *g);
4551 }
4552
4553 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4554         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4555         ExecDirectoryType dt;
4556         unsigned i;
4557         int r;
4558
4559         assert(c);
4560         assert(f);
4561
4562         prefix = strempty(prefix);
4563
4564         fprintf(f,
4565                 "%sUMask: %04o\n"
4566                 "%sWorkingDirectory: %s\n"
4567                 "%sRootDirectory: %s\n"
4568                 "%sNonBlocking: %s\n"
4569                 "%sPrivateTmp: %s\n"
4570                 "%sPrivateDevices: %s\n"
4571                 "%sProtectKernelTunables: %s\n"
4572                 "%sProtectKernelModules: %s\n"
4573                 "%sProtectKernelLogs: %s\n"
4574                 "%sProtectClock: %s\n"
4575                 "%sProtectControlGroups: %s\n"
4576                 "%sPrivateNetwork: %s\n"
4577                 "%sPrivateUsers: %s\n"
4578                 "%sProtectHome: %s\n"
4579                 "%sProtectSystem: %s\n"
4580                 "%sMountAPIVFS: %s\n"
4581                 "%sIgnoreSIGPIPE: %s\n"
4582                 "%sMemoryDenyWriteExecute: %s\n"
4583                 "%sRestrictRealtime: %s\n"
4584                 "%sRestrictSUIDSGID: %s\n"
4585                 "%sKeyringMode: %s\n"
4586                 "%sProtectHostname: %s\n",
4587                 prefix, c->umask,
4588                 prefix, c->working_directory ? c->working_directory : "/",
4589                 prefix, c->root_directory ? c->root_directory : "/",
4590                 prefix, yes_no(c->non_blocking),
4591                 prefix, yes_no(c->private_tmp),
4592                 prefix, yes_no(c->private_devices),
4593                 prefix, yes_no(c->protect_kernel_tunables),
4594                 prefix, yes_no(c->protect_kernel_modules),
4595                 prefix, yes_no(c->protect_kernel_logs),
4596                 prefix, yes_no(c->protect_clock),
4597                 prefix, yes_no(c->protect_control_groups),
4598                 prefix, yes_no(c->private_network),
4599                 prefix, yes_no(c->private_users),
4600                 prefix, protect_home_to_string(c->protect_home),
4601                 prefix, protect_system_to_string(c->protect_system),
4602                 prefix, yes_no(c->mount_apivfs),
4603                 prefix, yes_no(c->ignore_sigpipe),
4604                 prefix, yes_no(c->memory_deny_write_execute),
4605                 prefix, yes_no(c->restrict_realtime),
4606                 prefix, yes_no(c->restrict_suid_sgid),
4607                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4608                 prefix, yes_no(c->protect_hostname));
4609
4610         if (c->root_image)
4611                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4612
4613         if (c->root_hash) {
4614                 _cleanup_free_ char *encoded = NULL;
4615                 encoded = hexmem(c->root_hash, c->root_hash_size);
4616                 if (encoded)
4617                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
4618         }
4619
4620         if (c->root_hash_path)
4621                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
4622
4623         if (c->root_hash_sig) {
4624                 _cleanup_free_ char *encoded = NULL;
4625                 ssize_t len;
4626                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
4627                 if (len)
4628                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
4629         }
4630
4631         if (c->root_hash_sig_path)
4632                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
4633
4634         if (c->root_verity)
4635                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
4636
4637         STRV_FOREACH(e, c->environment)
4638                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4639
4640         STRV_FOREACH(e, c->environment_files)
4641                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4642
4643         STRV_FOREACH(e, c->pass_environment)
4644                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4645
4646         STRV_FOREACH(e, c->unset_environment)
4647                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4648
4649         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4650
4651         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4652                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4653
4654                 STRV_FOREACH(d, c->directories[dt].paths)
4655                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4656         }
4657
4658         fprintf(f,
4659                 "%sTimeoutCleanSec: %s\n",
4660                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4661
4662         if (c->nice_set)
4663                 fprintf(f,
4664                         "%sNice: %i\n",
4665                         prefix, c->nice);
4666
4667         if (c->oom_score_adjust_set)
4668                 fprintf(f,
4669                         "%sOOMScoreAdjust: %i\n",
4670                         prefix, c->oom_score_adjust);
4671
4672         if (c->coredump_filter_set)
4673                 fprintf(f,
4674                         "%sCoredumpFilter: 0x%"PRIx64"\n",
4675                         prefix, c->coredump_filter);
4676
4677         for (i = 0; i < RLIM_NLIMITS; i++)
4678                 if (c->rlimit[i]) {
4679                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4680                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4681                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4682                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4683                 }
4684
4685         if (c->ioprio_set) {
4686                 _cleanup_free_ char *class_str = NULL;
4687
4688                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4689                 if (r >= 0)
4690                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4691
4692                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4693         }
4694
4695         if (c->cpu_sched_set) {
4696                 _cleanup_free_ char *policy_str = NULL;
4697
4698                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4699                 if (r >= 0)
4700                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4701
4702                 fprintf(f,
4703                         "%sCPUSchedulingPriority: %i\n"
4704                         "%sCPUSchedulingResetOnFork: %s\n",
4705                         prefix, c->cpu_sched_priority,
4706                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4707         }
4708
4709         if (c->cpu_set.set) {
4710                 _cleanup_free_ char *affinity = NULL;
4711
4712                 affinity = cpu_set_to_range_string(&c->cpu_set);
4713                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4714         }
4715
4716         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4717                 _cleanup_free_ char *nodes = NULL;
4718
4719                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4720                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4721                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4722         }
4723
4724         if (c->timer_slack_nsec != NSEC_INFINITY)
4725                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4726
4727         fprintf(f,
4728                 "%sStandardInput: %s\n"
4729                 "%sStandardOutput: %s\n"
4730                 "%sStandardError: %s\n",
4731                 prefix, exec_input_to_string(c->std_input),
4732                 prefix, exec_output_to_string(c->std_output),
4733                 prefix, exec_output_to_string(c->std_error));
4734
4735         if (c->std_input == EXEC_INPUT_NAMED_FD)
4736                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4737         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4738                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4739         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4740                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4741
4742         if (c->std_input == EXEC_INPUT_FILE)
4743                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4744         if (c->std_output == EXEC_OUTPUT_FILE)
4745                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4746         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4747                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4748         if (c->std_error == EXEC_OUTPUT_FILE)
4749                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4750         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4751                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4752
4753         if (c->tty_path)
4754                 fprintf(f,
4755                         "%sTTYPath: %s\n"
4756                         "%sTTYReset: %s\n"
4757                         "%sTTYVHangup: %s\n"
4758                         "%sTTYVTDisallocate: %s\n",
4759                         prefix, c->tty_path,
4760                         prefix, yes_no(c->tty_reset),
4761                         prefix, yes_no(c->tty_vhangup),
4762                         prefix, yes_no(c->tty_vt_disallocate));
4763
4764         if (IN_SET(c->std_output,
4765                    EXEC_OUTPUT_KMSG,
4766                    EXEC_OUTPUT_JOURNAL,
4767                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4768                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4769             IN_SET(c->std_error,
4770                    EXEC_OUTPUT_KMSG,
4771                    EXEC_OUTPUT_JOURNAL,
4772                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4773                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4774
4775                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4776
4777                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4778                 if (r >= 0)
4779                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4780
4781                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4782                 if (r >= 0)
4783                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4784         }
4785
4786         if (c->log_level_max >= 0) {
4787                 _cleanup_free_ char *t = NULL;
4788
4789                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4790
4791                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4792         }
4793
4794         if (c->log_ratelimit_interval_usec > 0) {
4795                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4796
4797                 fprintf(f,
4798                         "%sLogRateLimitIntervalSec: %s\n",
4799                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4800         }
4801
4802         if (c->log_ratelimit_burst > 0)
4803                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4804
4805         if (c->n_log_extra_fields > 0) {
4806                 size_t j;
4807
4808                 for (j = 0; j < c->n_log_extra_fields; j++) {
4809                         fprintf(f, "%sLogExtraFields: ", prefix);
4810                         fwrite(c->log_extra_fields[j].iov_base,
4811                                1, c->log_extra_fields[j].iov_len,
4812                                f);
4813                         fputc('\n', f);
4814                 }
4815         }
4816
4817         if (c->log_namespace)
4818                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4819
4820         if (c->secure_bits) {
4821                 _cleanup_free_ char *str = NULL;
4822
4823                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4824                 if (r >= 0)
4825                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4826         }
4827
4828         if (c->capability_bounding_set != CAP_ALL) {
4829                 _cleanup_free_ char *str = NULL;
4830
4831                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4832                 if (r >= 0)
4833                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4834         }
4835
4836         if (c->capability_ambient_set != 0) {
4837                 _cleanup_free_ char *str = NULL;
4838
4839                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4840                 if (r >= 0)
4841                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4842         }
4843
4844         if (c->user)
4845                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4846         if (c->group)
4847                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4848
4849         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4850
4851         if (!strv_isempty(c->supplementary_groups)) {
4852                 fprintf(f, "%sSupplementaryGroups:", prefix);
4853                 strv_fprintf(f, c->supplementary_groups);
4854                 fputs("\n", f);
4855         }
4856
4857         if (c->pam_name)
4858                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4859
4860         if (!strv_isempty(c->read_write_paths)) {
4861                 fprintf(f, "%sReadWritePaths:", prefix);
4862                 strv_fprintf(f, c->read_write_paths);
4863                 fputs("\n", f);
4864         }
4865
4866         if (!strv_isempty(c->read_only_paths)) {
4867                 fprintf(f, "%sReadOnlyPaths:", prefix);
4868                 strv_fprintf(f, c->read_only_paths);
4869                 fputs("\n", f);
4870         }
4871
4872         if (!strv_isempty(c->inaccessible_paths)) {
4873                 fprintf(f, "%sInaccessiblePaths:", prefix);
4874                 strv_fprintf(f, c->inaccessible_paths);
4875                 fputs("\n", f);
4876         }
4877
4878         if (c->n_bind_mounts > 0)
4879                 for (i = 0; i < c->n_bind_mounts; i++)
4880                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4881                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4882                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4883                                 c->bind_mounts[i].source,
4884                                 c->bind_mounts[i].destination,
4885                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4886
4887         if (c->n_temporary_filesystems > 0)
4888                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4889                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4890
4891                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4892                                 t->path,
4893                                 isempty(t->options) ? "" : ":",
4894                                 strempty(t->options));
4895                 }
4896
4897         if (c->utmp_id)
4898                 fprintf(f,
4899                         "%sUtmpIdentifier: %s\n",
4900                         prefix, c->utmp_id);
4901
4902         if (c->selinux_context)
4903                 fprintf(f,
4904                         "%sSELinuxContext: %s%s\n",
4905                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4906
4907         if (c->apparmor_profile)
4908                 fprintf(f,
4909                         "%sAppArmorProfile: %s%s\n",
4910                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4911
4912         if (c->smack_process_label)
4913                 fprintf(f,
4914                         "%sSmackProcessLabel: %s%s\n",
4915                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4916
4917         if (c->personality != PERSONALITY_INVALID)
4918                 fprintf(f,
4919                         "%sPersonality: %s\n",
4920                         prefix, strna(personality_to_string(c->personality)));
4921
4922         fprintf(f,
4923                 "%sLockPersonality: %s\n",
4924                 prefix, yes_no(c->lock_personality));
4925
4926         if (c->syscall_filter) {
4927 #if HAVE_SECCOMP
4928                 Iterator j;
4929                 void *id, *val;
4930                 bool first = true;
4931 #endif
4932
4933                 fprintf(f,
4934                         "%sSystemCallFilter: ",
4935                         prefix);
4936
4937                 if (!c->syscall_allow_list)
4938                         fputc('~', f);
4939
4940 #if HAVE_SECCOMP
4941                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4942                         _cleanup_free_ char *name = NULL;
4943                         const char *errno_name = NULL;
4944                         int num = PTR_TO_INT(val);
4945
4946                         if (first)
4947                                 first = false;
4948                         else
4949                                 fputc(' ', f);
4950
4951                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4952                         fputs(strna(name), f);
4953
4954                         if (num >= 0) {
4955                                 errno_name = errno_to_name(num);
4956                                 if (errno_name)
4957                                         fprintf(f, ":%s", errno_name);
4958                                 else
4959                                         fprintf(f, ":%d", num);
4960                         }
4961                 }
4962 #endif
4963
4964                 fputc('\n', f);
4965         }
4966
4967         if (c->syscall_archs) {
4968 #if HAVE_SECCOMP
4969                 Iterator j;
4970                 void *id;
4971 #endif
4972
4973                 fprintf(f,
4974                         "%sSystemCallArchitectures:",
4975                         prefix);
4976
4977 #if HAVE_SECCOMP
4978                 SET_FOREACH(id, c->syscall_archs, j)
4979                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4980 #endif
4981                 fputc('\n', f);
4982         }
4983
4984         if (exec_context_restrict_namespaces_set(c)) {
4985                 _cleanup_free_ char *s = NULL;
4986
4987                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4988                 if (r >= 0)
4989                         fprintf(f, "%sRestrictNamespaces: %s\n",
4990                                 prefix, strna(s));
4991         }
4992
4993         if (c->network_namespace_path)
4994                 fprintf(f,
4995                         "%sNetworkNamespacePath: %s\n",
4996                         prefix, c->network_namespace_path);
4997
4998         if (c->syscall_errno > 0) {
4999                 const char *errno_name;
5000
5001                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5002
5003                 errno_name = errno_to_name(c->syscall_errno);
5004                 if (errno_name)
5005                         fprintf(f, "%s\n", errno_name);
5006                 else
5007                         fprintf(f, "%d\n", c->syscall_errno);
5008         }
5009 }
5010
5011 bool exec_context_maintains_privileges(const ExecContext *c) {
5012         assert(c);
5013
5014         /* Returns true if the process forked off would run under
5015          * an unchanged UID or as root. */
5016
5017         if (!c->user)
5018                 return true;
5019
5020         if (streq(c->user, "root") || streq(c->user, "0"))
5021                 return true;
5022
5023         return false;
5024 }
5025
5026 int exec_context_get_effective_ioprio(const ExecContext *c) {
5027         int p;
5028
5029         assert(c);
5030
5031         if (c->ioprio_set)
5032                 return c->ioprio;
5033
5034         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5035         if (p < 0)
5036                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5037
5038         return p;
5039 }
5040
5041 void exec_context_free_log_extra_fields(ExecContext *c) {
5042         size_t l;
5043
5044         assert(c);
5045
5046         for (l = 0; l < c->n_log_extra_fields; l++)
5047                 free(c->log_extra_fields[l].iov_base);
5048         c->log_extra_fields = mfree(c->log_extra_fields);
5049         c->n_log_extra_fields = 0;
5050 }
5051
5052 void exec_context_revert_tty(ExecContext *c) {
5053         int r;
5054
5055         assert(c);
5056
5057         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5058         exec_context_tty_reset(c, NULL);
5059
5060         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5061          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5062          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5063
5064         if (exec_context_may_touch_tty(c)) {
5065                 const char *path;
5066
5067                 path = exec_context_tty_path(c);
5068                 if (path) {
5069                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5070                         if (r < 0 && r != -ENOENT)
5071                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5072                 }
5073         }
5074 }
5075
5076 int exec_context_get_clean_directories(
5077                 ExecContext *c,
5078                 char **prefix,
5079                 ExecCleanMask mask,
5080                 char ***ret) {
5081
5082         _cleanup_strv_free_ char **l = NULL;
5083         ExecDirectoryType t;
5084         int r;
5085
5086         assert(c);
5087         assert(prefix);
5088         assert(ret);
5089
5090         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5091                 char **i;
5092
5093                 if (!FLAGS_SET(mask, 1U << t))
5094                         continue;
5095
5096                 if (!prefix[t])
5097                         continue;
5098
5099                 STRV_FOREACH(i, c->directories[t].paths) {
5100                         char *j;
5101
5102                         j = path_join(prefix[t], *i);
5103                         if (!j)
5104                                 return -ENOMEM;
5105
5106                         r = strv_consume(&l, j);
5107                         if (r < 0)
5108                                 return r;
5109
5110                         /* Also remove private directories unconditionally. */
5111                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5112                                 j = path_join(prefix[t], "private", *i);
5113                                 if (!j)
5114                                         return -ENOMEM;
5115
5116                                 r = strv_consume(&l, j);
5117                                 if (r < 0)
5118                                         return r;
5119                         }
5120                 }
5121         }
5122
5123         *ret = TAKE_PTR(l);
5124         return 0;
5125 }
5126
5127 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5128         ExecCleanMask mask = 0;
5129
5130         assert(c);
5131         assert(ret);
5132
5133         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5134                 if (!strv_isempty(c->directories[t].paths))
5135                         mask |= 1U << t;
5136
5137         *ret = mask;
5138         return 0;
5139 }
5140
5141 void exec_status_start(ExecStatus *s, pid_t pid) {
5142         assert(s);
5143
5144         *s = (ExecStatus) {
5145                 .pid = pid,
5146         };
5147
5148         dual_timestamp_get(&s->start_timestamp);
5149 }
5150
5151 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5152         assert(s);
5153
5154         if (s->pid != pid) {
5155                 *s = (ExecStatus) {
5156                         .pid = pid,
5157                 };
5158         }
5159
5160         dual_timestamp_get(&s->exit_timestamp);
5161
5162         s->code = code;
5163         s->status = status;
5164
5165         if (context && context->utmp_id)
5166                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5167 }
5168
5169 void exec_status_reset(ExecStatus *s) {
5170         assert(s);
5171
5172         *s = (ExecStatus) {};
5173 }
5174
5175 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5176         char buf[FORMAT_TIMESTAMP_MAX];
5177
5178         assert(s);
5179         assert(f);
5180
5181         if (s->pid <= 0)
5182                 return;
5183
5184         prefix = strempty(prefix);
5185
5186         fprintf(f,
5187                 "%sPID: "PID_FMT"\n",
5188                 prefix, s->pid);
5189
5190         if (dual_timestamp_is_set(&s->start_timestamp))
5191                 fprintf(f,
5192                         "%sStart Timestamp: %s\n",
5193                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5194
5195         if (dual_timestamp_is_set(&s->exit_timestamp))
5196                 fprintf(f,
5197                         "%sExit Timestamp: %s\n"
5198                         "%sExit Code: %s\n"
5199                         "%sExit Status: %i\n",
5200                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5201                         prefix, sigchld_code_to_string(s->code),
5202                         prefix, s->status);
5203 }
5204
5205 static char *exec_command_line(char **argv) {
5206         size_t k;
5207         char *n, *p, **a;
5208         bool first = true;
5209
5210         assert(argv);
5211
5212         k = 1;
5213         STRV_FOREACH(a, argv)
5214                 k += strlen(*a)+3;
5215
5216         n = new(char, k);
5217         if (!n)
5218                 return NULL;
5219
5220         p = n;
5221         STRV_FOREACH(a, argv) {
5222
5223                 if (!first)
5224                         *(p++) = ' ';
5225                 else
5226                         first = false;
5227
5228                 if (strpbrk(*a, WHITESPACE)) {
5229                         *(p++) = '\'';
5230                         p = stpcpy(p, *a);
5231                         *(p++) = '\'';
5232                 } else
5233                         p = stpcpy(p, *a);
5234
5235         }
5236
5237         *p = 0;
5238
5239         /* FIXME: this doesn't really handle arguments that have
5240          * spaces and ticks in them */
5241
5242         return n;
5243 }
5244
5245 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5246         _cleanup_free_ char *cmd = NULL;
5247         const char *prefix2;
5248
5249         assert(c);
5250         assert(f);
5251
5252         prefix = strempty(prefix);
5253         prefix2 = strjoina(prefix, "\t");
5254
5255         cmd = exec_command_line(c->argv);
5256         fprintf(f,
5257                 "%sCommand Line: %s\n",
5258                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5259
5260         exec_status_dump(&c->exec_status, f, prefix2);
5261 }
5262
5263 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5264         assert(f);
5265
5266         prefix = strempty(prefix);
5267
5268         LIST_FOREACH(command, c, c)
5269                 exec_command_dump(c, f, prefix);
5270 }
5271
5272 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5273         ExecCommand *end;
5274
5275         assert(l);
5276         assert(e);
5277
5278         if (*l) {
5279                 /* It's kind of important, that we keep the order here */
5280                 LIST_FIND_TAIL(command, *l, end);
5281                 LIST_INSERT_AFTER(command, *l, end, e);
5282         } else
5283               *l = e;
5284 }
5285
5286 int exec_command_set(ExecCommand *c, const char *path, ...) {
5287         va_list ap;
5288         char **l, *p;
5289
5290         assert(c);
5291         assert(path);
5292
5293         va_start(ap, path);
5294         l = strv_new_ap(path, ap);
5295         va_end(ap);
5296
5297         if (!l)
5298                 return -ENOMEM;
5299
5300         p = strdup(path);
5301         if (!p) {
5302                 strv_free(l);
5303                 return -ENOMEM;
5304         }
5305
5306         free_and_replace(c->path, p);
5307
5308         return strv_free_and_replace(c->argv, l);
5309 }
5310
5311 int exec_command_append(ExecCommand *c, const char *path, ...) {
5312         _cleanup_strv_free_ char **l = NULL;
5313         va_list ap;
5314         int r;
5315
5316         assert(c);
5317         assert(path);
5318
5319         va_start(ap, path);
5320         l = strv_new_ap(path, ap);
5321         va_end(ap);
5322
5323         if (!l)
5324                 return -ENOMEM;
5325
5326         r = strv_extend_strv(&c->argv, l, false);
5327         if (r < 0)
5328                 return r;
5329
5330         return 0;
5331 }
5332
5333 static void *remove_tmpdir_thread(void *p) {
5334         _cleanup_free_ char *path = p;
5335
5336         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5337         return NULL;
5338 }
5339
5340 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5341         int r;
5342
5343         if (!rt)
5344                 return NULL;
5345
5346         if (rt->manager)
5347                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5348
5349         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5350         if (destroy && rt->tmp_dir) {
5351                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5352
5353                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5354                 if (r < 0) {
5355                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5356                         free(rt->tmp_dir);
5357                 }
5358
5359                 rt->tmp_dir = NULL;
5360         }
5361
5362         if (destroy && rt->var_tmp_dir) {
5363                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5364
5365                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5366                 if (r < 0) {
5367                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5368                         free(rt->var_tmp_dir);
5369                 }
5370
5371                 rt->var_tmp_dir = NULL;
5372         }
5373
5374         rt->id = mfree(rt->id);
5375         rt->tmp_dir = mfree(rt->tmp_dir);
5376         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5377         safe_close_pair(rt->netns_storage_socket);
5378         return mfree(rt);
5379 }
5380
5381 static void exec_runtime_freep(ExecRuntime **rt) {
5382         (void) exec_runtime_free(*rt, false);
5383 }
5384
5385 static int exec_runtime_allocate(ExecRuntime **ret) {
5386         ExecRuntime *n;
5387
5388         assert(ret);
5389
5390         n = new(ExecRuntime, 1);
5391         if (!n)
5392                 return -ENOMEM;
5393
5394         *n = (ExecRuntime) {
5395                 .netns_storage_socket = { -1, -1 },
5396         };
5397
5398         *ret = n;
5399         return 0;
5400 }
5401
5402 static int exec_runtime_add(
5403                 Manager *m,
5404                 const char *id,
5405                 const char *tmp_dir,
5406                 const char *var_tmp_dir,
5407                 const int netns_storage_socket[2],
5408                 ExecRuntime **ret) {
5409
5410         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5411         int r;
5412
5413         assert(m);
5414         assert(id);
5415
5416         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5417         if (r < 0)
5418                 return r;
5419
5420         r = exec_runtime_allocate(&rt);
5421         if (r < 0)
5422                 return r;
5423
5424         rt->id = strdup(id);
5425         if (!rt->id)
5426                 return -ENOMEM;
5427
5428         if (tmp_dir) {
5429                 rt->tmp_dir = strdup(tmp_dir);
5430                 if (!rt->tmp_dir)
5431                         return -ENOMEM;
5432
5433                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5434                 assert(var_tmp_dir);
5435                 rt->var_tmp_dir = strdup(var_tmp_dir);
5436                 if (!rt->var_tmp_dir)
5437                         return -ENOMEM;
5438         }
5439
5440         if (netns_storage_socket) {
5441                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5442                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5443         }
5444
5445         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5446         if (r < 0)
5447                 return r;
5448
5449         rt->manager = m;
5450
5451         if (ret)
5452                 *ret = rt;
5453
5454         /* do not remove created ExecRuntime object when the operation succeeds. */
5455         rt = NULL;
5456         return 0;
5457 }
5458
5459 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5460         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5461         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5462         int r;
5463
5464         assert(m);
5465         assert(c);
5466         assert(id);
5467
5468         /* It is not necessary to create ExecRuntime object. */
5469         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5470                 return 0;
5471
5472         if (c->private_tmp &&
5473             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5474               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5475                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5476                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5477                 if (r < 0)
5478                         return r;
5479         }
5480
5481         if (c->private_network || c->network_namespace_path) {
5482                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5483                         return -errno;
5484         }
5485
5486         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5487         if (r < 0)
5488                 return r;
5489
5490         /* Avoid cleanup */
5491         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5492         return 1;
5493 }
5494
5495 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5496         ExecRuntime *rt;
5497         int r;
5498
5499         assert(m);
5500         assert(id);
5501         assert(ret);
5502
5503         rt = hashmap_get(m->exec_runtime_by_id, id);
5504         if (rt)
5505                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5506                 goto ref;
5507
5508         if (!create)
5509                 return 0;
5510
5511         /* If not found, then create a new object. */
5512         r = exec_runtime_make(m, c, id, &rt);
5513         if (r <= 0)
5514                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5515                 return r;
5516
5517 ref:
5518         /* increment reference counter. */
5519         rt->n_ref++;
5520         *ret = rt;
5521         return 1;
5522 }
5523
5524 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5525         if (!rt)
5526                 return NULL;
5527
5528         assert(rt->n_ref > 0);
5529
5530         rt->n_ref--;
5531         if (rt->n_ref > 0)
5532                 return NULL;
5533
5534         return exec_runtime_free(rt, destroy);
5535 }
5536
5537 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5538         ExecRuntime *rt;
5539         Iterator i;
5540
5541         assert(m);
5542         assert(f);
5543         assert(fds);
5544
5545         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5546                 fprintf(f, "exec-runtime=%s", rt->id);
5547
5548                 if (rt->tmp_dir)
5549                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5550
5551                 if (rt->var_tmp_dir)
5552                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5553
5554                 if (rt->netns_storage_socket[0] >= 0) {
5555                         int copy;
5556
5557                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5558                         if (copy < 0)
5559                                 return copy;
5560
5561                         fprintf(f, " netns-socket-0=%i", copy);
5562                 }
5563
5564                 if (rt->netns_storage_socket[1] >= 0) {
5565                         int copy;
5566
5567                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5568                         if (copy < 0)
5569                                 return copy;
5570
5571                         fprintf(f, " netns-socket-1=%i", copy);
5572                 }
5573
5574                 fputc('\n', f);
5575         }
5576
5577         return 0;
5578 }
5579
5580 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5581         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5582         ExecRuntime *rt;
5583         int r;
5584
5585         /* This is for the migration from old (v237 or earlier) deserialization text.
5586          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5587          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5588          * so or not from the serialized text, then we always creates a new object owned by this. */
5589
5590         assert(u);
5591         assert(key);
5592         assert(value);
5593
5594         /* Manager manages ExecRuntime objects by the unit id.
5595          * So, we omit the serialized text when the unit does not have id (yet?)... */
5596         if (isempty(u->id)) {
5597                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5598                 return 0;
5599         }
5600
5601         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5602         if (r < 0) {
5603                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5604                 return 0;
5605         }
5606
5607         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5608         if (!rt) {
5609                 r = exec_runtime_allocate(&rt_create);
5610                 if (r < 0)
5611                         return log_oom();
5612
5613                 rt_create->id = strdup(u->id);
5614                 if (!rt_create->id)
5615                         return log_oom();
5616
5617                 rt = rt_create;
5618         }
5619
5620         if (streq(key, "tmp-dir")) {
5621                 char *copy;
5622
5623                 copy = strdup(value);
5624                 if (!copy)
5625                         return log_oom();
5626
5627                 free_and_replace(rt->tmp_dir, copy);
5628
5629         } else if (streq(key, "var-tmp-dir")) {
5630                 char *copy;
5631
5632                 copy = strdup(value);
5633                 if (!copy)
5634                         return log_oom();
5635
5636                 free_and_replace(rt->var_tmp_dir, copy);
5637
5638         } else if (streq(key, "netns-socket-0")) {
5639                 int fd;
5640
5641                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5642                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5643                         return 0;
5644                 }
5645
5646                 safe_close(rt->netns_storage_socket[0]);
5647                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5648
5649         } else if (streq(key, "netns-socket-1")) {
5650                 int fd;
5651
5652                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5653                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5654                         return 0;
5655                 }
5656
5657                 safe_close(rt->netns_storage_socket[1]);
5658                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5659         } else
5660                 return 0;
5661
5662         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5663         if (rt_create) {
5664                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5665                 if (r < 0) {
5666                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5667                         return 0;
5668                 }
5669
5670                 rt_create->manager = u->manager;
5671
5672                 /* Avoid cleanup */
5673                 rt_create = NULL;
5674         }
5675
5676         return 1;
5677 }
5678
5679 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5680         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5681         int r, fd0 = -1, fd1 = -1;
5682         const char *p, *v = value;
5683         size_t n;
5684
5685         assert(m);
5686         assert(value);
5687         assert(fds);
5688
5689         n = strcspn(v, " ");
5690         id = strndupa(v, n);
5691         if (v[n] != ' ')
5692                 goto finalize;
5693         p = v + n + 1;
5694
5695         v = startswith(p, "tmp-dir=");
5696         if (v) {
5697                 n = strcspn(v, " ");
5698                 tmp_dir = strndupa(v, n);
5699                 if (v[n] != ' ')
5700                         goto finalize;
5701                 p = v + n + 1;
5702         }
5703
5704         v = startswith(p, "var-tmp-dir=");
5705         if (v) {
5706                 n = strcspn(v, " ");
5707                 var_tmp_dir = strndupa(v, n);
5708                 if (v[n] != ' ')
5709                         goto finalize;
5710                 p = v + n + 1;
5711         }
5712
5713         v = startswith(p, "netns-socket-0=");
5714         if (v) {
5715                 char *buf;
5716
5717                 n = strcspn(v, " ");
5718                 buf = strndupa(v, n);
5719                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5720                         log_debug("Unable to process exec-runtime netns fd specification.");
5721                         return;
5722                 }
5723                 fd0 = fdset_remove(fds, fd0);
5724                 if (v[n] != ' ')
5725                         goto finalize;
5726                 p = v + n + 1;
5727         }
5728
5729         v = startswith(p, "netns-socket-1=");
5730         if (v) {
5731                 char *buf;
5732
5733                 n = strcspn(v, " ");
5734                 buf = strndupa(v, n);
5735                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5736                         log_debug("Unable to process exec-runtime netns fd specification.");
5737                         return;
5738                 }
5739                 fd1 = fdset_remove(fds, fd1);
5740         }
5741
5742 finalize:
5743
5744         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5745         if (r < 0)
5746                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5747 }
5748
5749 void exec_runtime_vacuum(Manager *m) {
5750         ExecRuntime *rt;
5751         Iterator i;
5752
5753         assert(m);
5754
5755         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5756
5757         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5758                 if (rt->n_ref > 0)
5759                         continue;
5760
5761                 (void) exec_runtime_free(rt, false);
5762         }
5763 }
5764
5765 void exec_params_clear(ExecParameters *p) {
5766         if (!p)
5767                 return;
5768
5769         strv_free(p->environment);
5770 }
5771
/* Names of the ExecInput values, indexed by the enum value. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

/* NOTE(review): presumably generates the exec_input string <-> enum lookup functions from the table above;
 * the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5784
/* Names of the ExecOutput values, indexed by the enum value. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
};

/* NOTE(review): presumably generates the exec_output string <-> enum lookup functions from the table
 * above; the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5800
/* Names of the ExecUtmpMode values, indexed by the enum value. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

/* NOTE(review): presumably generates the exec_utmp_mode string <-> enum lookup functions from the table
 * above; the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5808
/* Names of the ExecPreserveMode values, indexed by the enum value. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* NOTE(review): judging by the macro name this variant additionally accepts boolean strings when parsing,
 * with EXEC_PRESERVE_YES passed as the value to use for "true" — confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5816
/* This table maps each ExecDirectoryType to the name of the unit setting it is configured with. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* NOTE(review): presumably generates the exec_directory_type string <-> enum lookup functions from the
 * table above; the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5827
/* This table maps ExecDirectoryType as well, but to a generic term identifying the type of resource. It
 * is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

/* NOTE(review): presumably generates the exec_resource_type string <-> enum lookup functions from the
 * table above; the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5840
/* This table maps ExecDirectoryType too, this time to the name of the environment variable in which the
 * selected directory is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

/* NOTE(review): judging by the macro name only a file-local enum -> string function is generated here, no
 * parser — confirm against the macro definition. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5852
/* Names of the ExecKeyringMode values, indexed by the enum value. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

/* NOTE(review): presumably generates the exec_keyring_mode string <-> enum lookup functions from the table
 * above; the macro is defined outside of this file. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);