src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "bpf-lsm.h"
  45 #include "cap-list.h"
  46 #include "capability-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase-symlinks.h"
  49 #include "chown-recursive.h"
  50 #include "cpu-set-util.h"
  51 #include "creds-util.h"
  52 #include "data-fd-util.h"
  53 #include "def.h"
  54 #include "env-file.h"
  55 #include "env-util.h"
  56 #include "errno-list.h"
  57 #include "escape.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "fileio.h"
  62 #include "format-util.h"
  63 #include "glob-util.h"
  64 #include "hexdecoct.h"
  65 #include "io-util.h"
  66 #include "label.h"
  67 #include "log.h"
  68 #include "macro.h"
  69 #include "manager.h"
  70 #include "manager-dump.h"
  71 #include "memory-util.h"
  72 #include "missing_fs.h"
  73 #include "missing_ioprio.h"
  74 #include "mkdir.h"
  75 #include "mount-util.h"
  76 #include "mountpoint-util.h"
  77 #include "namespace.h"
  78 #include "parse-util.h"
  79 #include "path-util.h"
  80 #include "process-util.h"
  81 #include "random-util.h"
  82 #include "rlimit-util.h"
  83 #include "rm-rf.h"
  84 #if HAVE_SECCOMP
  85 #include "seccomp-util.h"
  86 #endif
  87 #include "securebits-util.h"
  88 #include "selinux-util.h"
  89 #include "signal-util.h"
  90 #include "smack-util.h"
  91 #include "socket-util.h"
  92 #include "special.h"
  93 #include "stat-util.h"
  94 #include "string-table.h"
  95 #include "string-util.h"
  96 #include "strv.h"
  97 #include "syslog-util.h"
  98 #include "terminal-util.h"
  99 #include "tmpfile-util.h"
 100 #include "umask-util.h"
 101 #include "unit-serialize.h"
 102 #include "user-util.h"
 103 #include "utmp-wtmp.h"
 104
/* Two timeouts, expressed in microseconds via USEC_PER_SEC (5s and 1s). Their users are not part of
 * this section of the file. NOTE(review): presumably the waits applied to "idle" start-up — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) passed to fd_inc_sndbuf() for the journal stream socket in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 109
 110 static int shift_fds(int fds[], size_t n_fds) {
 111         if (n_fds <= 0)
 112                 return 0;
 113
 114         /* Modifies the fds array! (sorts it) */
 115
 116         assert(fds);
 117
 118         for (int start = 0;;) {
 119                 int restart_from = -1;
 120
 121                 for (int i = start; i < (int) n_fds; i++) {
 122                         int nfd;
 123
 124                         /* Already at right index? */
 125                         if (fds[i] == i+3)
 126                                 continue;
 127
 128                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 129                         if (nfd < 0)
 130                                 return -errno;
 131
 132                         safe_close(fds[i]);
 133                         fds[i] = nfd;
 134
 135                         /* Hmm, the fd we wanted isn't free? Then
 136                          * let's remember that and try again from here */
 137                         if (nfd != i+3 && restart_from < 0)
 138                                 restart_from = i;
 139                 }
 140
 141                 if (restart_from < 0)
 142                         break;
 143
 144                 start = restart_from;
 145         }
 146
 147         return 0;
 148 }
 149
 150 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 151         size_t n_fds;
 152         int r;
 153
 154         n_fds = n_socket_fds + n_storage_fds;
 155         if (n_fds <= 0)
 156                 return 0;
 157
 158         assert(fds);
 159
 160         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 161          * O_NONBLOCK only applies to socket activation though. */
 162
 163         for (size_t i = 0; i < n_fds; i++) {
 164
 165                 if (i < n_socket_fds) {
 166                         r = fd_nonblock(fds[i], nonblock);
 167                         if (r < 0)
 168                                 return r;
 169                 }
 170
 171                 /* We unconditionally drop FD_CLOEXEC from the fds,
 172                  * since after all we want to pass these fds to our
 173                  * children */
 174
 175                 r = fd_cloexec(fds[i], false);
 176                 if (r < 0)
 177                         return r;
 178         }
 179
 180         return 0;
 181 }
 182
 183 static const char *exec_context_tty_path(const ExecContext *context) {
 184         assert(context);
 185
 186         if (context->stdio_as_fds)
 187                 return NULL;
 188
 189         if (context->tty_path)
 190                 return context->tty_path;
 191
 192         return "/dev/console";
 193 }
 194
 195 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 196         const char *path;
 197
 198         assert(context);
 199
 200         path = exec_context_tty_path(context);
 201
 202         if (context->tty_vhangup) {
 203                 if (p && p->stdin_fd >= 0)
 204                         (void) terminal_vhangup_fd(p->stdin_fd);
 205                 else if (path)
 206                         (void) terminal_vhangup(path);
 207         }
 208
 209         if (context->tty_reset) {
 210                 if (p && p->stdin_fd >= 0)
 211                         (void) reset_terminal_fd(p->stdin_fd, true);
 212                 else if (path)
 213                         (void) reset_terminal(path);
 214         }
 215
 216         if (context->tty_vt_disallocate && path)
 217                 (void) vt_disallocate(path);
 218 }
 219
 220 static bool is_terminal_input(ExecInput i) {
 221         return IN_SET(i,
 222                       EXEC_INPUT_TTY,
 223                       EXEC_INPUT_TTY_FORCE,
 224                       EXEC_INPUT_TTY_FAIL);
 225 }
 226
 227 static bool is_terminal_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_TTY,
 230                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 231                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 232 }
 233
 234 static bool is_kmsg_output(ExecOutput o) {
 235         return IN_SET(o,
 236                       EXEC_OUTPUT_KMSG,
 237                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 238 }
 239
 240 static bool exec_context_needs_term(const ExecContext *c) {
 241         assert(c);
 242
 243         /* Return true if the execution context suggests we should set $TERM to something useful. */
 244
 245         if (is_terminal_input(c->std_input))
 246                 return true;
 247
 248         if (is_terminal_output(c->std_output))
 249                 return true;
 250
 251         if (is_terminal_output(c->std_error))
 252                 return true;
 253
 254         return !!c->tty_path;
 255 }
 256
 257 static int open_null_as(int flags, int nfd) {
 258         int fd;
 259
 260         assert(nfd >= 0);
 261
 262         fd = open("/dev/null", flags|O_NOCTTY);
 263         if (fd < 0)
 264                 return -errno;
 265
 266         return move_fd(fd, nfd, false);
 267 }
 268
 269 static int connect_journal_socket(
 270                 int fd,
 271                 const char *log_namespace,
 272                 uid_t uid,
 273                 gid_t gid) {
 274
 275         union sockaddr_union sa;
 276         socklen_t sa_len;
 277         uid_t olduid = UID_INVALID;
 278         gid_t oldgid = GID_INVALID;
 279         const char *j;
 280         int r;
 281
 282         j = log_namespace ?
 283                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 284                 "/run/systemd/journal/stdout";
 285         r = sockaddr_un_set_path(&sa.un, j);
 286         if (r < 0)
 287                 return r;
 288         sa_len = r;
 289
 290         if (gid_is_valid(gid)) {
 291                 oldgid = getgid();
 292
 293                 if (setegid(gid) < 0)
 294                         return -errno;
 295         }
 296
 297         if (uid_is_valid(uid)) {
 298                 olduid = getuid();
 299
 300                 if (seteuid(uid) < 0) {
 301                         r = -errno;
 302                         goto restore_gid;
 303                 }
 304         }
 305
 306         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 307
 308         /* If we fail to restore the uid or gid, things will likely
 309            fail later on. This should only happen if an LSM interferes. */
 310
 311         if (uid_is_valid(uid))
 312                 (void) seteuid(olduid);
 313
 314  restore_gid:
 315         if (gid_is_valid(gid))
 316                 (void) setegid(oldgid);
 317
 318         return r;
 319 }
 320
 321 static int connect_logger_as(
 322                 const Unit *unit,
 323                 const ExecContext *context,
 324                 const ExecParameters *params,
 325                 ExecOutput output,
 326                 const char *ident,
 327                 int nfd,
 328                 uid_t uid,
 329                 gid_t gid) {
 330
 331         _cleanup_close_ int fd = -1;
 332         int r;
 333
 334         assert(context);
 335         assert(params);
 336         assert(output < _EXEC_OUTPUT_MAX);
 337         assert(ident);
 338         assert(nfd >= 0);
 339
 340         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 341         if (fd < 0)
 342                 return -errno;
 343
 344         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 345         if (r < 0)
 346                 return r;
 347
 348         if (shutdown(fd, SHUT_RD) < 0)
 349                 return -errno;
 350
 351         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 352
 353         if (dprintf(fd,
 354                 "%s\n"
 355                 "%s\n"
 356                 "%i\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n"
 360                 "%i\n",
 361                 context->syslog_identifier ?: ident,
 362                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 363                 context->syslog_priority,
 364                 !!context->syslog_level_prefix,
 365                 false,
 366                 is_kmsg_output(output),
 367                 is_terminal_output(output)) < 0)
 368                 return -errno;
 369
 370         return move_fd(TAKE_FD(fd), nfd, false);
 371 }
 372
 373 static int open_terminal_as(const char *path, int flags, int nfd) {
 374         int fd;
 375
 376         assert(path);
 377         assert(nfd >= 0);
 378
 379         fd = open_terminal(path, flags | O_NOCTTY);
 380         if (fd < 0)
 381                 return fd;
 382
 383         return move_fd(fd, nfd, false);
 384 }
 385
 386 static int acquire_path(const char *path, int flags, mode_t mode) {
 387         union sockaddr_union sa;
 388         socklen_t sa_len;
 389         _cleanup_close_ int fd = -1;
 390         int r;
 391
 392         assert(path);
 393
 394         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 395                 flags |= O_CREAT;
 396
 397         fd = open(path, flags|O_NOCTTY, mode);
 398         if (fd >= 0)
 399                 return TAKE_FD(fd);
 400
 401         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 402                 return -errno;
 403
 404         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 405
 406         r = sockaddr_un_set_path(&sa.un, path);
 407         if (r < 0)
 408                 return r == -EINVAL ? -ENXIO : r;
 409         sa_len = r;
 410
 411         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 412         if (fd < 0)
 413                 return -errno;
 414
 415         if (connect(fd, &sa.sa, sa_len) < 0)
 416                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 417                                                            * indication that this wasn't an AF_UNIX socket after all */
 418
 419         if ((flags & O_ACCMODE) == O_RDONLY)
 420                 r = shutdown(fd, SHUT_WR);
 421         else if ((flags & O_ACCMODE) == O_WRONLY)
 422                 r = shutdown(fd, SHUT_RD);
 423         else
 424                 r = 0;
 425         if (r < 0)
 426                 return -errno;
 427
 428         return TAKE_FD(fd);
 429 }
 430
 431 static int fixup_input(
 432                 const ExecContext *context,
 433                 int socket_fd,
 434                 bool apply_tty_stdin) {
 435
 436         ExecInput std_input;
 437
 438         assert(context);
 439
 440         std_input = context->std_input;
 441
 442         if (is_terminal_input(std_input) && !apply_tty_stdin)
 443                 return EXEC_INPUT_NULL;
 444
 445         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 446                 return EXEC_INPUT_NULL;
 447
 448         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 449                 return EXEC_INPUT_NULL;
 450
 451         return std_input;
 452 }
 453
 454 static int fixup_output(ExecOutput output, int socket_fd) {
 455
 456         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 457                 return EXEC_OUTPUT_INHERIT;
 458
 459         return output;
 460 }
 461
 462 static int setup_input(
 463                 const ExecContext *context,
 464                 const ExecParameters *params,
 465                 int socket_fd,
 466                 const int named_iofds[static 3]) {
 467
 468         ExecInput i;
 469
 470         assert(context);
 471         assert(params);
 472         assert(named_iofds);
 473
 474         if (params->stdin_fd >= 0) {
 475                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 476                         return -errno;
 477
 478                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 479                 if (isatty(STDIN_FILENO)) {
 480                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 481                         (void) reset_terminal_fd(STDIN_FILENO, true);
 482                 }
 483
 484                 return STDIN_FILENO;
 485         }
 486
 487         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 488
 489         switch (i) {
 490
 491         case EXEC_INPUT_NULL:
 492                 return open_null_as(O_RDONLY, STDIN_FILENO);
 493
 494         case EXEC_INPUT_TTY:
 495         case EXEC_INPUT_TTY_FORCE:
 496         case EXEC_INPUT_TTY_FAIL: {
 497                 int fd;
 498
 499                 fd = acquire_terminal(exec_context_tty_path(context),
 500                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 501                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 502                                                                   ACQUIRE_TERMINAL_WAIT,
 503                                       USEC_INFINITY);
 504                 if (fd < 0)
 505                         return fd;
 506
 507                 return move_fd(fd, STDIN_FILENO, false);
 508         }
 509
 510         case EXEC_INPUT_SOCKET:
 511                 assert(socket_fd >= 0);
 512
 513                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 514
 515         case EXEC_INPUT_NAMED_FD:
 516                 assert(named_iofds[STDIN_FILENO] >= 0);
 517
 518                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 519                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 520
 521         case EXEC_INPUT_DATA: {
 522                 int fd;
 523
 524                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 525                 if (fd < 0)
 526                         return fd;
 527
 528                 return move_fd(fd, STDIN_FILENO, false);
 529         }
 530
 531         case EXEC_INPUT_FILE: {
 532                 bool rw;
 533                 int fd;
 534
 535                 assert(context->stdio_file[STDIN_FILENO]);
 536
 537                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 538                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 539
 540                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 541                 if (fd < 0)
 542                         return fd;
 543
 544                 return move_fd(fd, STDIN_FILENO, false);
 545         }
 546
 547         default:
 548                 assert_not_reached();
 549         }
 550 }
 551
 552 static bool can_inherit_stderr_from_stdout(
 553                 const ExecContext *context,
 554                 ExecOutput o,
 555                 ExecOutput e) {
 556
 557         assert(context);
 558
 559         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 560          * stderr fd */
 561
 562         if (e == EXEC_OUTPUT_INHERIT)
 563                 return true;
 564         if (e != o)
 565                 return false;
 566
 567         if (e == EXEC_OUTPUT_NAMED_FD)
 568                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 569
 570         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 571                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 572
 573         return true;
 574 }
 575
 576 static int setup_output(
 577                 const Unit *unit,
 578                 const ExecContext *context,
 579                 const ExecParameters *params,
 580                 int fileno,
 581                 int socket_fd,
 582                 const int named_iofds[static 3],
 583                 const char *ident,
 584                 uid_t uid,
 585                 gid_t gid,
 586                 dev_t *journal_stream_dev,
 587                 ino_t *journal_stream_ino) {
 588
 589         ExecOutput o;
 590         ExecInput i;
 591         int r;
 592
 593         assert(unit);
 594         assert(context);
 595         assert(params);
 596         assert(ident);
 597         assert(journal_stream_dev);
 598         assert(journal_stream_ino);
 599
 600         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 601
 602                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 603                         return -errno;
 604
 605                 return STDOUT_FILENO;
 606         }
 607
 608         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 609                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 610                         return -errno;
 611
 612                 return STDERR_FILENO;
 613         }
 614
 615         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 616         o = fixup_output(context->std_output, socket_fd);
 617
 618         if (fileno == STDERR_FILENO) {
 619                 ExecOutput e;
 620                 e = fixup_output(context->std_error, socket_fd);
 621
 622                 /* This expects the input and output are already set up */
 623
 624                 /* Don't change the stderr file descriptor if we inherit all
 625                  * the way and are not on a tty */
 626                 if (e == EXEC_OUTPUT_INHERIT &&
 627                     o == EXEC_OUTPUT_INHERIT &&
 628                     i == EXEC_INPUT_NULL &&
 629                     !is_terminal_input(context->std_input) &&
 630                     getppid() != 1)
 631                         return fileno;
 632
 633                 /* Duplicate from stdout if possible */
 634                 if (can_inherit_stderr_from_stdout(context, o, e))
 635                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 636
 637                 o = e;
 638
 639         } else if (o == EXEC_OUTPUT_INHERIT) {
 640                 /* If input got downgraded, inherit the original value */
 641                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 642                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 643
 644                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 645                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 646                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 647
 648                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 649                 if (getppid() != 1)
 650                         return fileno;
 651
 652                 /* We need to open /dev/null here anew, to get the right access mode. */
 653                 return open_null_as(O_WRONLY, fileno);
 654         }
 655
 656         switch (o) {
 657
 658         case EXEC_OUTPUT_NULL:
 659                 return open_null_as(O_WRONLY, fileno);
 660
 661         case EXEC_OUTPUT_TTY:
 662                 if (is_terminal_input(i))
 663                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 664
 665                 /* We don't reset the terminal if this is just about output */
 666                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 667
 668         case EXEC_OUTPUT_KMSG:
 669         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 670         case EXEC_OUTPUT_JOURNAL:
 671         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 672                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 673                 if (r < 0) {
 674                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 675                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 676                         r = open_null_as(O_WRONLY, fileno);
 677                 } else {
 678                         struct stat st;
 679
 680                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 681                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 682                          * services to detect whether they are connected to the journal or not.
 683                          *
 684                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 685                          * about STDERR as that's usually the best way to do logging. */
 686
 687                         if (fstat(fileno, &st) >= 0 &&
 688                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 689                                 *journal_stream_dev = st.st_dev;
 690                                 *journal_stream_ino = st.st_ino;
 691                         }
 692                 }
 693                 return r;
 694
 695         case EXEC_OUTPUT_SOCKET:
 696                 assert(socket_fd >= 0);
 697
 698                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 699
 700         case EXEC_OUTPUT_NAMED_FD:
 701                 assert(named_iofds[fileno] >= 0);
 702
 703                 (void) fd_nonblock(named_iofds[fileno], false);
 704                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 705
 706         case EXEC_OUTPUT_FILE:
 707         case EXEC_OUTPUT_FILE_APPEND:
 708         case EXEC_OUTPUT_FILE_TRUNCATE: {
 709                 bool rw;
 710                 int fd, flags;
 711
 712                 assert(context->stdio_file[fileno]);
 713
 714                 rw = context->std_input == EXEC_INPUT_FILE &&
 715                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 716
 717                 if (rw)
 718                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 719
 720                 flags = O_WRONLY;
 721                 if (o == EXEC_OUTPUT_FILE_APPEND)
 722                         flags |= O_APPEND;
 723                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 724                         flags |= O_TRUNC;
 725
 726                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 727                 if (fd < 0)
 728                         return fd;
 729
 730                 return move_fd(fd, fileno, 0);
 731         }
 732
 733         default:
 734                 assert_not_reached();
 735         }
 736 }
 737
 738 static int chown_terminal(int fd, uid_t uid) {
 739         int r;
 740
 741         assert(fd >= 0);
 742
 743         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 744         if (isatty(fd) < 1) {
 745                 if (IN_SET(errno, EINVAL, ENOTTY))
 746                         return 0; /* not a tty */
 747
 748                 return -errno;
 749         }
 750
 751         /* This might fail. What matters are the results. */
 752         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 753         if (r < 0)
 754                 return r;
 755
 756         return 1;
 757 }
 758
 759 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 760         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 761         int r;
 762
 763         assert(_saved_stdin);
 764         assert(_saved_stdout);
 765
 766         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 767         if (saved_stdin < 0)
 768                 return -errno;
 769
 770         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 771         if (saved_stdout < 0)
 772                 return -errno;
 773
 774         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 775         if (fd < 0)
 776                 return fd;
 777
 778         r = chown_terminal(fd, getuid());
 779         if (r < 0)
 780                 return r;
 781
 782         r = reset_terminal_fd(fd, true);
 783         if (r < 0)
 784                 return r;
 785
 786         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 787         fd = -1;
 788         if (r < 0)
 789                 return r;
 790
 791         *_saved_stdin = saved_stdin;
 792         *_saved_stdout = saved_stdout;
 793
 794         saved_stdin = saved_stdout = -1;
 795
 796         return 0;
 797 }
 798
 799 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 800         assert(err < 0);
 801
 802         if (err == -ETIMEDOUT)
 803                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 804         else {
 805                 errno = -err;
 806                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 807         }
 808 }
 809
 810 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 811         _cleanup_close_ int fd = -1;
 812
 813         assert(vc);
 814
 815         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 816         if (fd < 0)
 817                 return;
 818
 819         write_confirm_error_fd(err, fd, u);
 820 }
 821
 822 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 823         int r = 0;
 824
 825         assert(saved_stdin);
 826         assert(saved_stdout);
 827
 828         release_terminal();
 829
 830         if (*saved_stdin >= 0)
 831                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 832                         r = -errno;
 833
 834         if (*saved_stdout >= 0)
 835                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 836                         r = -errno;
 837
 838         *saved_stdin = safe_close(*saved_stdin);
 839         *saved_stdout = safe_close(*saved_stdout);
 840
 841         return r;
 842 }
 843
 844 enum {
 845         CONFIRM_PRETEND_FAILURE = -1,
 846         CONFIRM_PRETEND_SUCCESS =  0,
 847         CONFIRM_EXECUTE = 1,
 848 };
 849
 850 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 851         int saved_stdout = -1, saved_stdin = -1, r;
 852         _cleanup_free_ char *e = NULL;
 853         char c;
 854
 855         /* For any internal errors, assume a positive response. */
 856         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 857         if (r < 0) {
 858                 write_confirm_error(r, vc, u);
 859                 return CONFIRM_EXECUTE;
 860         }
 861
 862         /* confirm_spawn might have been disabled while we were sleeping. */
 863         if (manager_is_confirm_spawn_disabled(u->manager)) {
 864                 r = 1;
 865                 goto restore_stdio;
 866         }
 867
 868         e = ellipsize(cmdline, 60, 100);
 869         if (!e) {
 870                 log_oom();
 871                 r = CONFIRM_EXECUTE;
 872                 goto restore_stdio;
 873         }
 874
 875         for (;;) {
 876                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 877                 if (r < 0) {
 878                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 879                         r = CONFIRM_EXECUTE;
 880                         goto restore_stdio;
 881                 }
 882
 883                 switch (c) {
 884                 case 'c':
 885                         printf("Resuming normal execution.\n");
 886                         manager_disable_confirm_spawn();
 887                         r = 1;
 888                         break;
 889                 case 'D':
 890                         unit_dump(u, stdout, "  ");
 891                         continue; /* ask again */
 892                 case 'f':
 893                         printf("Failing execution.\n");
 894                         r = CONFIRM_PRETEND_FAILURE;
 895                         break;
 896                 case 'h':
 897                         printf("  c - continue, proceed without asking anymore\n"
 898                                "  D - dump, show the state of the unit\n"
 899                                "  f - fail, don't execute the command and pretend it failed\n"
 900                                "  h - help\n"
 901                                "  i - info, show a short summary of the unit\n"
 902                                "  j - jobs, show jobs that are in progress\n"
 903                                "  s - skip, don't execute the command and pretend it succeeded\n"
 904                                "  y - yes, execute the command\n");
 905                         continue; /* ask again */
 906                 case 'i':
 907                         printf("  Description: %s\n"
 908                                "  Unit:        %s\n"
 909                                "  Command:     %s\n",
 910                                u->id, u->description, cmdline);
 911                         continue; /* ask again */
 912                 case 'j':
 913                         manager_dump_jobs(u->manager, stdout, "  ");
 914                         continue; /* ask again */
 915                 case 'n':
 916                         /* 'n' was removed in favor of 'f'. */
 917                         printf("Didn't understand 'n', did you mean 'f'?\n");
 918                         continue; /* ask again */
 919                 case 's':
 920                         printf("Skipping execution.\n");
 921                         r = CONFIRM_PRETEND_SUCCESS;
 922                         break;
 923                 case 'y':
 924                         r = CONFIRM_EXECUTE;
 925                         break;
 926                 default:
 927                         assert_not_reached();
 928                 }
 929                 break;
 930         }
 931
 932 restore_stdio:
 933         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 934         return r;
 935 }
 936
 937 static int get_fixed_user(const ExecContext *c, const char **user,
 938                           uid_t *uid, gid_t *gid,
 939                           const char **home, const char **shell) {
 940         int r;
 941         const char *name;
 942
 943         assert(c);
 944
 945         if (!c->user)
 946                 return 0;
 947
 948         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 949          * (i.e. are "/" or "/bin/nologin"). */
 950
 951         name = c->user;
 952         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 953         if (r < 0)
 954                 return r;
 955
 956         *user = name;
 957         return 0;
 958 }
 959
 960 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 961         int r;
 962         const char *name;
 963
 964         assert(c);
 965
 966         if (!c->group)
 967                 return 0;
 968
 969         name = c->group;
 970         r = get_group_creds(&name, gid, 0);
 971         if (r < 0)
 972                 return r;
 973
 974         *group = name;
 975         return 0;
 976 }
 977
 978 static int get_supplementary_groups(const ExecContext *c, const char *user,
 979                                     const char *group, gid_t gid,
 980                                     gid_t **supplementary_gids, int *ngids) {
 981         char **i;
 982         int r, k = 0;
 983         int ngroups_max;
 984         bool keep_groups = false;
 985         gid_t *groups = NULL;
 986         _cleanup_free_ gid_t *l_gids = NULL;
 987
 988         assert(c);
 989
 990         /*
 991          * If user is given, then lookup GID and supplementary groups list.
 992          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 993          * here and as early as possible so we keep the list of supplementary
 994          * groups of the caller.
 995          */
 996         if (user && gid_is_valid(gid) && gid != 0) {
 997                 /* First step, initialize groups from /etc/groups */
 998                 if (initgroups(user, gid) < 0)
 999                         return -errno;
1000
1001                 keep_groups = true;
1002         }
1003
1004         if (strv_isempty(c->supplementary_groups))
1005                 return 0;
1006
1007         /*
1008          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1009          * be positive, otherwise fail.
1010          */
1011         errno = 0;
1012         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1013         if (ngroups_max <= 0)
1014                 return errno_or_else(EOPNOTSUPP);
1015
1016         l_gids = new(gid_t, ngroups_max);
1017         if (!l_gids)
1018                 return -ENOMEM;
1019
1020         if (keep_groups) {
1021                 /*
1022                  * Lookup the list of groups that the user belongs to, we
1023                  * avoid NSS lookups here too for gid=0.
1024                  */
1025                 k = ngroups_max;
1026                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1027                         return -EINVAL;
1028         } else
1029                 k = 0;
1030
1031         STRV_FOREACH(i, c->supplementary_groups) {
1032                 const char *g;
1033
1034                 if (k >= ngroups_max)
1035                         return -E2BIG;
1036
1037                 g = *i;
1038                 r = get_group_creds(&g, l_gids+k, 0);
1039                 if (r < 0)
1040                         return r;
1041
1042                 k++;
1043         }
1044
1045         /*
1046          * Sets ngids to zero to drop all supplementary groups, happens
1047          * when we are under root and SupplementaryGroups= is empty.
1048          */
1049         if (k == 0) {
1050                 *ngids = 0;
1051                 return 0;
1052         }
1053
1054         /* Otherwise get the final list of supplementary groups */
1055         groups = memdup(l_gids, sizeof(gid_t) * k);
1056         if (!groups)
1057                 return -ENOMEM;
1058
1059         *supplementary_gids = groups;
1060         *ngids = k;
1061
1062         groups = NULL;
1063
1064         return 0;
1065 }
1066
/* Applies the supplementary groups and then the primary GID to the calling process.
 *
 * maybe_setgroups() is only invoked when ngids is positive; setresgid() is only invoked when gid is
 * valid, and sets real, effective and saved GID alike.
 *
 * Returns 0 on success, the negative value from maybe_setgroups(), or -errno from setresgid(). */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1085
/* Updates the securebits of the calling process: clears every bit in mask, then sets every bit in
 * bits.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if new securebits
 * were installed, or -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        int after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1099
1100 static int enforce_user(const ExecContext *context, uid_t uid) {
1101         assert(context);
1102         int r;
1103
1104         if (!uid_is_valid(uid))
1105                 return 0;
1106
1107         /* Sets (but doesn't look up) the uid and make sure we keep the
1108          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1109          * required, so we also need keep-caps in this case.
1110          */
1111
1112         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1113
1114                 /* First step: If we need to keep capabilities but
1115                  * drop privileges we need to make sure we keep our
1116                  * caps, while we drop privileges. */
1117                 if (uid != 0) {
1118                         /* Add KEEP_CAPS to the securebits */
1119                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1120                         if (r < 0)
1121                                 return r;
1122                 }
1123         }
1124
1125         /* Second step: actually set the uids */
1126         if (setresuid(uid, uid, uid) < 0)
1127                 return -errno;
1128
1129         /* At this point we should have all necessary capabilities but
1130            are otherwise a normal user. However, the caps might got
1131            corrupted due to the setresuid() so we need clean them up
1132            later. This is done outside of this call. */
1133
1134         return 0;
1135 }
1136
1137 #if HAVE_PAM
1138
1139 static int null_conv(
1140                 int num_msg,
1141                 const struct pam_message **msg,
1142                 struct pam_response **resp,
1143                 void *appdata_ptr) {
1144
1145         /* We don't support conversations */
1146
1147         return PAM_CONV_ERR;
1148 }
1149
1150 #endif
1151
/* Opens a PAM session for a service process and forks off a helper that tears it down again.
 *
 * name      PAM service name handed to pam_start().
 * user      User name handed to pam_start().
 * uid, gid  Credentials the forked "(sd-pam)" helper tries to drop to.
 * tty       Value for the PAM_TTY item; if NULL and a TTY name can be read off stdin,
 *           "/dev/<name>" is used instead.
 * env       In/out environment block: every entry is exported to PAM via pam_putenv(), and on
 *           success the block is replaced by the list obtained from pam_getenvlist().
 * fds       File descriptors (n_fds of them) that the helper closes right after the fork.
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or the
 * negative value from barrier_create()/safe_fork() if those failed. When built without PAM support
 * the function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_GET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM modules quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller's environment into the PAM environment. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child will ever wait for SIGTERM, hence don't leave it blocked in this process. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure through its return value (a positive error
                                 * number); it never returns a negative value and does not set errno.
                                 * Only look at sig once the call actually succeeded. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM-provided environment to the caller, replacing the previous block. */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code says why) or a non-PAM step failed (r says why). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1375
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * path. At most the final 8 characters of that component are used, so the result never exceeds 10
 * characters (the length of "/sbin/init") and looks pretty in /bin/ps. If the component is empty
 * the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *tail;
        size_t n;
        char *w;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the name is usually the more interesting part, since the beginning
                 * might just be "systemd-". */
                tail += n - 8;
                n = 8;
        }

        /* Assemble '(' + tail + ')' + NUL: at most 1 + 8 + 1 + 1 = 11 bytes. */
        w = process_name;
        *w++ = '(';
        memcpy(w, tail, n);
        w += n;
        *w++ = ')';
        *w = 0;

        rename_process(process_name);
}
1406
1407 static bool context_has_address_families(const ExecContext *c) {
1408         assert(c);
1409
1410         return c->address_families_allow_list ||
1411                 !set_isempty(c->address_families);
1412 }
1413
1414 static bool context_has_syscall_filters(const ExecContext *c) {
1415         assert(c);
1416
1417         return c->syscall_allow_list ||
1418                 !hashmap_isempty(c->syscall_filter);
1419 }
1420
1421 static bool context_has_syscall_logs(const ExecContext *c) {
1422         assert(c);
1423
1424         return c->syscall_log_allow_list ||
1425                 !hashmap_isempty(c->syscall_log);
1426 }
1427
1428 static bool context_has_no_new_privileges(const ExecContext *c) {
1429         assert(c);
1430
1431         if (c->no_new_privileges)
1432                 return true;
1433
1434         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435                 return false;
1436
1437         /* We need NNP if we have any form of seccomp and are unprivileged */
1438         return c->lock_personality ||
1439                 c->memory_deny_write_execute ||
1440                 c->private_devices ||
1441                 c->protect_clock ||
1442                 c->protect_hostname ||
1443                 c->protect_kernel_tunables ||
1444                 c->protect_kernel_modules ||
1445                 c->protect_kernel_logs ||
1446                 context_has_address_families(c) ||
1447                 exec_context_restrict_namespaces_set(c) ||
1448                 c->restrict_realtime ||
1449                 c->restrict_suid_sgid ||
1450                 !set_isempty(c->syscall_archs) ||
1451                 context_has_syscall_filters(c) ||
1452                 context_has_syscall_logs(c);
1453 }
1454
1455 static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457         assert(context);
1458
1459         return !hashmap_isempty(context->set_credentials) ||
1460                 !hashmap_isempty(context->load_credentials);
1461 }
1462
1463 #if HAVE_SECCOMP
1464
1465 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1466
1467         if (is_seccomp_available())
1468                 return false;
1469
1470         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1471         return true;
1472 }
1473
1474 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1475         uint32_t negative_action, default_action, action;
1476         int r;
1477
1478         assert(u);
1479         assert(c);
1480
1481         if (!context_has_syscall_filters(c))
1482                 return 0;
1483
1484         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485                 return 0;
1486
1487         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1488
1489         if (c->syscall_allow_list) {
1490                 default_action = negative_action;
1491                 action = SCMP_ACT_ALLOW;
1492         } else {
1493                 default_action = SCMP_ACT_ALLOW;
1494                 action = negative_action;
1495         }
1496
1497         if (needs_ambient_hack) {
1498                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1499                 if (r < 0)
1500                         return r;
1501         }
1502
1503         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1504 }
1505
1506 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507 #ifdef SCMP_ACT_LOG
1508         uint32_t default_action, action;
1509 #endif
1510
1511         assert(u);
1512         assert(c);
1513
1514         if (!context_has_syscall_logs(c))
1515                 return 0;
1516
1517 #ifdef SCMP_ACT_LOG
1518         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519                 return 0;
1520
1521         if (c->syscall_log_allow_list) {
1522                 /* Log nothing but the ones listed */
1523                 default_action = SCMP_ACT_ALLOW;
1524                 action = SCMP_ACT_LOG;
1525         } else {
1526                 /* Log everything but the ones listed */
1527                 default_action = SCMP_ACT_LOG;
1528                 action = SCMP_ACT_ALLOW;
1529         }
1530
1531         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532 #else
1533         /* old libseccomp */
1534         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535         return 0;
1536 #endif
1537 }
1538
1539 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540         assert(u);
1541         assert(c);
1542
1543         if (set_isempty(c->syscall_archs))
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547                 return 0;
1548
1549         return seccomp_restrict_archs(c->syscall_archs);
1550 }
1551
1552 static int apply_address_families(const Unit* u, const ExecContext *c) {
1553         assert(u);
1554         assert(c);
1555
1556         if (!context_has_address_families(c))
1557                 return 0;
1558
1559         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560                 return 0;
1561
1562         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1563 }
1564
1565 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1566         assert(u);
1567         assert(c);
1568
1569         if (!c->memory_deny_write_execute)
1570                 return 0;
1571
1572         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573                 return 0;
1574
1575         return seccomp_memory_deny_write_execute();
1576 }
1577
1578 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1579         assert(u);
1580         assert(c);
1581
1582         if (!c->restrict_realtime)
1583                 return 0;
1584
1585         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586                 return 0;
1587
1588         return seccomp_restrict_realtime();
1589 }
1590
1591 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592         assert(u);
1593         assert(c);
1594
1595         if (!c->restrict_suid_sgid)
1596                 return 0;
1597
1598         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599                 return 0;
1600
1601         return seccomp_restrict_suid_sgid();
1602 }
1603
1604 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1605         assert(u);
1606         assert(c);
1607
1608         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609          * let's protect even those systems where this is left on in the kernel. */
1610
1611         if (!c->protect_kernel_tunables)
1612                 return 0;
1613
1614         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615                 return 0;
1616
1617         return seccomp_protect_sysctl();
1618 }
1619
1620 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1621         assert(u);
1622         assert(c);
1623
1624         /* Turn off module syscalls on ProtectKernelModules=yes */
1625
1626         if (!c->protect_kernel_modules)
1627                 return 0;
1628
1629         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630                 return 0;
1631
1632         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1633 }
1634
1635 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636         assert(u);
1637         assert(c);
1638
1639         if (!c->protect_kernel_logs)
1640                 return 0;
1641
1642         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643                 return 0;
1644
1645         return seccomp_protect_syslog();
1646 }
1647
1648 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1649         assert(u);
1650         assert(c);
1651
1652         if (!c->protect_clock)
1653                 return 0;
1654
1655         if (skip_seccomp_unavailable(u, "ProtectClock="))
1656                 return 0;
1657
1658         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659 }
1660
1661 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1662         assert(u);
1663         assert(c);
1664
1665         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1666
1667         if (!c->private_devices)
1668                 return 0;
1669
1670         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671                 return 0;
1672
1673         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1674 }
1675
1676 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1677         assert(u);
1678         assert(c);
1679
1680         if (!exec_context_restrict_namespaces_set(c))
1681                 return 0;
1682
1683         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684                 return 0;
1685
1686         return seccomp_restrict_namespaces(c->restrict_namespaces);
1687 }
1688
1689 #if HAVE_LIBBPF
1690 static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
1691         if (lsm_bpf_supported())
1692                 return false;
1693
1694         log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1695         return true;
1696 }
1697
1698 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1699         assert(u);
1700         assert(c);
1701
1702         if (!exec_context_restrict_filesystems_set(c))
1703                 return 0;
1704
1705         if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1706                 return 0;
1707
1708         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1709 }
1710 #endif
1711
1712 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1713         unsigned long personality;
1714         int r;
1715
1716         assert(u);
1717         assert(c);
1718
1719         if (!c->lock_personality)
1720                 return 0;
1721
1722         if (skip_seccomp_unavailable(u, "LockPersonality="))
1723                 return 0;
1724
1725         personality = c->personality;
1726
1727         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1728         if (personality == PERSONALITY_INVALID) {
1729
1730                 r = opinionated_personality(&personality);
1731                 if (r < 0)
1732                         return r;
1733         }
1734
1735         return seccomp_lock_personality(personality);
1736 }
1737
1738 #endif
1739
1740 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1741         assert(u);
1742         assert(c);
1743
1744         if (!c->protect_hostname)
1745                 return 0;
1746
1747         if (ns_type_supported(NAMESPACE_UTS)) {
1748                 if (unshare(CLONE_NEWUTS) < 0) {
1749                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1750                                 *ret_exit_status = EXIT_NAMESPACE;
1751                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1752                         }
1753
1754                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1755                 }
1756         } else
1757                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1758
1759 #if HAVE_SECCOMP
1760         int r;
1761
1762         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1763                 return 0;
1764
1765         r = seccomp_protect_hostname();
1766         if (r < 0) {
1767                 *ret_exit_status = EXIT_SECCOMP;
1768                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1769         }
1770 #endif
1771
1772         return 0;
1773 }
1774
1775 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1776         assert(idle_pipe);
1777
1778         idle_pipe[1] = safe_close(idle_pipe[1]);
1779         idle_pipe[2] = safe_close(idle_pipe[2]);
1780
1781         if (idle_pipe[0] >= 0) {
1782                 int r;
1783
1784                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1785
1786                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1787                         ssize_t n;
1788
1789                         /* Signal systemd that we are bored and want to continue. */
1790                         n = write(idle_pipe[3], "x", 1);
1791                         if (n > 0)
1792                                 /* Wait for systemd to react to the signal above. */
1793                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1794                 }
1795
1796                 idle_pipe[0] = safe_close(idle_pipe[0]);
1797
1798         }
1799
1800         idle_pipe[3] = safe_close(idle_pipe[3]);
1801 }
1802
1803 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1804
1805 static int build_environment(
1806                 const Unit *u,
1807                 const ExecContext *c,
1808                 const ExecParameters *p,
1809                 size_t n_fds,
1810                 const char *home,
1811                 const char *username,
1812                 const char *shell,
1813                 dev_t journal_stream_dev,
1814                 ino_t journal_stream_ino,
1815                 char ***ret) {
1816
1817         _cleanup_strv_free_ char **our_env = NULL;
1818         size_t n_env = 0;
1819         char *x;
1820
1821         assert(u);
1822         assert(c);
1823         assert(p);
1824         assert(ret);
1825
1826 #define N_ENV_VARS 17
1827         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1828         if (!our_env)
1829                 return -ENOMEM;
1830
1831         if (n_fds > 0) {
1832                 _cleanup_free_ char *joined = NULL;
1833
1834                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1835                         return -ENOMEM;
1836                 our_env[n_env++] = x;
1837
1838                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1839                         return -ENOMEM;
1840                 our_env[n_env++] = x;
1841
1842                 joined = strv_join(p->fd_names, ":");
1843                 if (!joined)
1844                         return -ENOMEM;
1845
1846                 x = strjoin("LISTEN_FDNAMES=", joined);
1847                 if (!x)
1848                         return -ENOMEM;
1849                 our_env[n_env++] = x;
1850         }
1851
1852         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1853                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1854                         return -ENOMEM;
1855                 our_env[n_env++] = x;
1856
1857                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1858                         return -ENOMEM;
1859                 our_env[n_env++] = x;
1860         }
1861
1862         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1863          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1864          * check the database directly. */
1865         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1866                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1867                 if (!x)
1868                         return -ENOMEM;
1869                 our_env[n_env++] = x;
1870         }
1871
1872         if (home) {
1873                 x = strjoin("HOME=", home);
1874                 if (!x)
1875                         return -ENOMEM;
1876
1877                 path_simplify(x + 5);
1878                 our_env[n_env++] = x;
1879         }
1880
1881         if (username) {
1882                 x = strjoin("LOGNAME=", username);
1883                 if (!x)
1884                         return -ENOMEM;
1885                 our_env[n_env++] = x;
1886
1887                 x = strjoin("USER=", username);
1888                 if (!x)
1889                         return -ENOMEM;
1890                 our_env[n_env++] = x;
1891         }
1892
1893         if (shell) {
1894                 x = strjoin("SHELL=", shell);
1895                 if (!x)
1896                         return -ENOMEM;
1897
1898                 path_simplify(x + 6);
1899                 our_env[n_env++] = x;
1900         }
1901
1902         if (!sd_id128_is_null(u->invocation_id)) {
1903                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1904                         return -ENOMEM;
1905
1906                 our_env[n_env++] = x;
1907         }
1908
1909         if (exec_context_needs_term(c)) {
1910                 const char *tty_path, *term = NULL;
1911
1912                 tty_path = exec_context_tty_path(c);
1913
1914                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1917
1918                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1919                         term = getenv("TERM");
1920
1921                 if (!term)
1922                         term = default_term_for_tty(tty_path);
1923
1924                 x = strjoin("TERM=", term);
1925                 if (!x)
1926                         return -ENOMEM;
1927                 our_env[n_env++] = x;
1928         }
1929
1930         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1931                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1932                         return -ENOMEM;
1933
1934                 our_env[n_env++] = x;
1935         }
1936
1937         if (c->log_namespace) {
1938                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1939                 if (!x)
1940                         return -ENOMEM;
1941
1942                 our_env[n_env++] = x;
1943         }
1944
1945         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1946                 _cleanup_free_ char *joined = NULL;
1947                 const char *n;
1948
1949                 if (!p->prefix[t])
1950                         continue;
1951
1952                 if (c->directories[t].n_items == 0)
1953                         continue;
1954
1955                 n = exec_directory_env_name_to_string(t);
1956                 if (!n)
1957                         continue;
1958
1959                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1960                         _cleanup_free_ char *prefixed = NULL;
1961
1962                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1963                         if (!prefixed)
1964                                 return -ENOMEM;
1965
1966                         if (!strextend_with_separator(&joined, ":", prefixed))
1967                                 return -ENOMEM;
1968                 }
1969
1970                 x = strjoin(n, "=", joined);
1971                 if (!x)
1972                         return -ENOMEM;
1973
1974                 our_env[n_env++] = x;
1975         }
1976
1977         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1978                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1979                 if (!x)
1980                         return -ENOMEM;
1981
1982                 our_env[n_env++] = x;
1983         }
1984
1985         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1986                 return -ENOMEM;
1987
1988         our_env[n_env++] = x;
1989
1990         our_env[n_env++] = NULL;
1991         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1992 #undef N_ENV_VARS
1993
1994         *ret = TAKE_PTR(our_env);
1995
1996         return 0;
1997 }
1998
1999 static int build_pass_environment(const ExecContext *c, char ***ret) {
2000         _cleanup_strv_free_ char **pass_env = NULL;
2001         size_t n_env = 0;
2002         char **i;
2003
2004         STRV_FOREACH(i, c->pass_environment) {
2005                 _cleanup_free_ char *x = NULL;
2006                 char *v;
2007
2008                 v = getenv(*i);
2009                 if (!v)
2010                         continue;
2011                 x = strjoin(*i, "=", v);
2012                 if (!x)
2013                         return -ENOMEM;
2014
2015                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2016                         return -ENOMEM;
2017
2018                 pass_env[n_env++] = TAKE_PTR(x);
2019                 pass_env[n_env] = NULL;
2020         }
2021
2022         *ret = TAKE_PTR(pass_env);
2023
2024         return 0;
2025 }
2026
2027 bool exec_needs_mount_namespace(
2028                 const ExecContext *context,
2029                 const ExecParameters *params,
2030                 const ExecRuntime *runtime) {
2031
2032         assert(context);
2033
2034         if (context->root_image)
2035                 return true;
2036
2037         if (!strv_isempty(context->read_write_paths) ||
2038             !strv_isempty(context->read_only_paths) ||
2039             !strv_isempty(context->inaccessible_paths) ||
2040             !strv_isempty(context->exec_paths) ||
2041             !strv_isempty(context->no_exec_paths))
2042                 return true;
2043
2044         if (context->n_bind_mounts > 0)
2045                 return true;
2046
2047         if (context->n_temporary_filesystems > 0)
2048                 return true;
2049
2050         if (context->n_mount_images > 0)
2051                 return true;
2052
2053         if (context->n_extension_images > 0)
2054                 return true;
2055
2056         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2057                 return true;
2058
2059         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2060                 return true;
2061
2062         if (context->private_devices ||
2063             context->private_mounts ||
2064             context->protect_system != PROTECT_SYSTEM_NO ||
2065             context->protect_home != PROTECT_HOME_NO ||
2066             context->protect_kernel_tunables ||
2067             context->protect_kernel_modules ||
2068             context->protect_kernel_logs ||
2069             context->protect_control_groups ||
2070             context->protect_proc != PROTECT_PROC_DEFAULT ||
2071             context->proc_subset != PROC_SUBSET_ALL ||
2072             context->private_ipc ||
2073             context->ipc_namespace_path)
2074                 return true;
2075
2076         if (context->root_directory) {
2077                 if (exec_context_get_effective_mount_apivfs(context))
2078                         return true;
2079
2080                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2081                         if (params && !params->prefix[t])
2082                                 continue;
2083
2084                         if (context->directories[t].n_items > 0)
2085                                 return true;
2086                 }
2087         }
2088
2089         if (context->dynamic_user &&
2090             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2091              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2092              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2093                 return true;
2094
2095         if (context->log_namespace)
2096                 return true;
2097
2098         return false;
2099 }
2100
2101 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2102         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2103         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2104         _cleanup_close_ int unshare_ready_fd = -1;
2105         _cleanup_(sigkill_waitp) pid_t pid = 0;
2106         uint64_t c = 1;
2107         ssize_t n;
2108         int r;
2109
2110         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2111          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2112          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2113          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2114          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2115          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2116          * continues execution normally.
2117          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2118          * does not need CAP_SETUID to write the single line mapping to itself. */
2119
2120         /* Can only set up multiple mappings with CAP_SETUID. */
2121         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2122                 r = asprintf(&uid_map,
2123                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2124                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2125                              ouid, ouid, uid, uid);
2126         else
2127                 r = asprintf(&uid_map,
2128                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2129                              ouid, ouid);
2130
2131         if (r < 0)
2132                 return -ENOMEM;
2133
2134         /* Can only set up multiple mappings with CAP_SETGID. */
2135         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2136                 r = asprintf(&gid_map,
2137                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2138                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2139                              ogid, ogid, gid, gid);
2140         else
2141                 r = asprintf(&gid_map,
2142                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2143                              ogid, ogid);
2144
2145         if (r < 0)
2146                 return -ENOMEM;
2147
2148         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2149          * namespace. */
2150         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2151         if (unshare_ready_fd < 0)
2152                 return -errno;
2153
2154         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2155          * failed. */
2156         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2157                 return -errno;
2158
2159         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2160         if (r < 0)
2161                 return r;
2162         if (r == 0) {
2163                 _cleanup_close_ int fd = -1;
2164                 const char *a;
2165                 pid_t ppid;
2166
2167                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2168                  * here, after the parent opened its own user namespace. */
2169
2170                 ppid = getppid();
2171                 errno_pipe[0] = safe_close(errno_pipe[0]);
2172
2173                 /* Wait until the parent unshared the user namespace */
2174                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2175                         r = -errno;
2176                         goto child_fail;
2177                 }
2178
2179                 /* Disable the setgroups() system call in the child user namespace, for good. */
2180                 a = procfs_file_alloca(ppid, "setgroups");
2181                 fd = open(a, O_WRONLY|O_CLOEXEC);
2182                 if (fd < 0) {
2183                         if (errno != ENOENT) {
2184                                 r = -errno;
2185                                 goto child_fail;
2186                         }
2187
2188                         /* If the file is missing the kernel is too old, let's continue anyway. */
2189                 } else {
2190                         if (write(fd, "deny\n", 5) < 0) {
2191                                 r = -errno;
2192                                 goto child_fail;
2193                         }
2194
2195                         fd = safe_close(fd);
2196                 }
2197
2198                 /* First write the GID map */
2199                 a = procfs_file_alloca(ppid, "gid_map");
2200                 fd = open(a, O_WRONLY|O_CLOEXEC);
2201                 if (fd < 0) {
2202                         r = -errno;
2203                         goto child_fail;
2204                 }
2205                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2206                         r = -errno;
2207                         goto child_fail;
2208                 }
2209                 fd = safe_close(fd);
2210
2211                 /* The write the UID map */
2212                 a = procfs_file_alloca(ppid, "uid_map");
2213                 fd = open(a, O_WRONLY|O_CLOEXEC);
2214                 if (fd < 0) {
2215                         r = -errno;
2216                         goto child_fail;
2217                 }
2218                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2219                         r = -errno;
2220                         goto child_fail;
2221                 }
2222
2223                 _exit(EXIT_SUCCESS);
2224
2225         child_fail:
2226                 (void) write(errno_pipe[1], &r, sizeof(r));
2227                 _exit(EXIT_FAILURE);
2228         }
2229
2230         errno_pipe[1] = safe_close(errno_pipe[1]);
2231
2232         if (unshare(CLONE_NEWUSER) < 0)
2233                 return -errno;
2234
2235         /* Let the child know that the namespace is ready now */
2236         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2237                 return -errno;
2238
2239         /* Try to read an error code from the child */
2240         n = read(errno_pipe[0], &r, sizeof(r));
2241         if (n < 0)
2242                 return -errno;
2243         if (n == sizeof(r)) { /* an error code was sent to us */
2244                 if (r < 0)
2245                         return r;
2246                 return -EIO;
2247         }
2248         if (n != 0) /* on success we should have read 0 bytes */
2249                 return -EIO;
2250
2251         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2252         pid = 0;
2253         if (r < 0)
2254                 return r;
2255         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2256                 return -EIO;
2257
2258         return 0;
2259 }
2260
2261 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2262         if (!context->dynamic_user)
2263                 return false;
2264
2265         if (type == EXEC_DIRECTORY_CONFIGURATION)
2266                 return false;
2267
2268         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2269                 return false;
2270
2271         return true;
2272 }
2273
2274 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2275         _cleanup_free_ char *src_abs = NULL;
2276         char **dst;
2277         int r;
2278
2279         assert(source);
2280
2281         src_abs = path_join(root, source);
2282         if (!src_abs)
2283                 return -ENOMEM;
2284
2285         STRV_FOREACH(dst, symlinks) {
2286                 _cleanup_free_ char *dst_abs = NULL;
2287
2288                 dst_abs = path_join(root, *dst);
2289                 if (!dst_abs)
2290                         return -ENOMEM;
2291
2292                 r = mkdir_parents_label(dst_abs, 0755);
2293                 if (r < 0)
2294                         return r;
2295
2296                 r = symlink_idempotent(src_abs, dst_abs, true);
2297                 if (r < 0)
2298                         return r;
2299         }
2300
2301         return 0;
2302 }
2303
2304 static int setup_exec_directory(
2305                 const ExecContext *context,
2306                 const ExecParameters *params,
2307                 uid_t uid,
2308                 gid_t gid,
2309                 ExecDirectoryType type,
2310                 bool needs_mount_namespace,
2311                 int *exit_status) {
2312
2313         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2314                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2315                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2316                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2317                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2318                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2319         };
2320         int r;
2321
2322         assert(context);
2323         assert(params);
2324         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2325         assert(exit_status);
2326
2327         if (!params->prefix[type])
2328                 return 0;
2329
2330         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2331                 if (!uid_is_valid(uid))
2332                         uid = 0;
2333                 if (!gid_is_valid(gid))
2334                         gid = 0;
2335         }
2336
2337         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2338                 _cleanup_free_ char *p = NULL, *pp = NULL;
2339
2340                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2341                 if (!p) {
2342                         r = -ENOMEM;
2343                         goto fail;
2344                 }
2345
2346                 r = mkdir_parents_label(p, 0755);
2347                 if (r < 0)
2348                         goto fail;
2349
2350                 if (exec_directory_is_private(context, type)) {
2351                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2352                          * case we want to avoid leaving a directory around fully accessible that is owned by
2353                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2354                          * trick used by container managers to prohibit host users to get access to files of
2355                          * the same UID in containers: we place everything inside a directory that has an
2356                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2357                          * for unprivileged host code. We then use fs namespacing to make this directory
2358                          * permeable for the service itself.
2359                          *
2360                          * Specifically: for a service which wants a special directory "foo/" we first create
2361                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2362                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2363                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2364                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2365                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2366                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2367                          * for the service and making sure it only gets access to the dirs it needs but no
2368                          * others. Tricky? Yes, absolutely, but it works!
2369                          *
2370                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2371                          * to be owned by the service itself.
2372                          *
2373                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2374                          * for sharing files or sockets with other services. */
2375
2376                         pp = path_join(params->prefix[type], "private");
2377                         if (!pp) {
2378                                 r = -ENOMEM;
2379                                 goto fail;
2380                         }
2381
2382                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2383                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2384                         if (r < 0)
2385                                 goto fail;
2386
2387                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2388                                 r = -ENOMEM;
2389                                 goto fail;
2390                         }
2391
2392                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2393                         r = mkdir_parents_label(pp, 0755);
2394                         if (r < 0)
2395                                 goto fail;
2396
2397                         if (is_dir(p, false) > 0 &&
2398                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2399
2400                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2401                                  * it over. Most likely the service has been upgraded from one that didn't use
2402                                  * DynamicUser=1, to one that does. */
2403
2404                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2405                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2406                                          exec_directory_type_to_string(type), p, pp);
2407
2408                                 if (rename(p, pp) < 0) {
2409                                         r = -errno;
2410                                         goto fail;
2411                                 }
2412                         } else {
2413                                 /* Otherwise, create the actual directory for the service */
2414
2415                                 r = mkdir_label(pp, context->directories[type].mode);
2416                                 if (r < 0 && r != -EEXIST)
2417                                         goto fail;
2418                         }
2419
2420                         /* And link it up from the original place. Note that if a mount namespace is going to be
2421                          * used, then this symlink remains on the host, and a new one for the child namespace will
2422                          * be created later. */
2423                         r = symlink_idempotent(pp, p, true);
2424                         if (r < 0)
2425                                 goto fail;
2426
2427                 } else {
2428                         _cleanup_free_ char *target = NULL;
2429
2430                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2431                             readlink_and_make_absolute(p, &target) >= 0) {
2432                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2433
2434                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2435                                  * by DynamicUser=1 (see above)?
2436                                  *
2437                                  * We do this for all directory types except for ConfigurationDirectory=,
2438                                  * since they all support the private/ symlink logic at least in some
2439                                  * configurations, see above. */
2440
2441                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2442                                 if (r < 0)
2443                                         goto fail;
2444
2445                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2446                                 if (!q) {
2447                                         r = -ENOMEM;
2448                                         goto fail;
2449                                 }
2450
2451                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2452                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2453                                 if (r < 0)
2454                                         goto fail;
2455
2456                                 if (path_equal(q_resolved, target_resolved)) {
2457
2458                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2459                                          * but is no longer. Let's move the directory back up. */
2460
2461                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2462                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2463                                                  exec_directory_type_to_string(type), q, p);
2464
2465                                         if (unlink(p) < 0) {
2466                                                 r = -errno;
2467                                                 goto fail;
2468                                         }
2469
2470                                         if (rename(q, p) < 0) {
2471                                                 r = -errno;
2472                                                 goto fail;
2473                                         }
2474                                 }
2475                         }
2476
2477                         r = mkdir_label(p, context->directories[type].mode);
2478                         if (r < 0) {
2479                                 if (r != -EEXIST)
2480                                         goto fail;
2481
2482                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2483                                         struct stat st;
2484
2485                                         /* Don't change the owner/access mode of the configuration directory,
2486                                          * as in the common case it is not written to by a service, and shall
2487                                          * not be writable. */
2488
2489                                         if (stat(p, &st) < 0) {
2490                                                 r = -errno;
2491                                                 goto fail;
2492                                         }
2493
2494                                         /* Still complain if the access mode doesn't match */
2495                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2496                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2497                                                             "(File system: %o %sMode: %o)",
2498                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2499                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2500
2501                                         continue;
2502                                 }
2503                         }
2504                 }
2505
2506                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2507                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2508                  * current UID/GID ownership.) */
2509                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2510                 if (r < 0)
2511                         goto fail;
2512
2513                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2514                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2515                  * assignments to exist. */
2516                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2517                 if (r < 0)
2518                         goto fail;
2519         }
2520
2521         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2522          * they are set up later, to allow configuring empty var/run/etc. */
2523         if (!needs_mount_namespace)
2524                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2525                         r = create_many_symlinks(params->prefix[type],
2526                                                  context->directories[type].items[i].path,
2527                                                  context->directories[type].items[i].symlinks);
2528                         if (r < 0)
2529                                 goto fail;
2530                 }
2531
2532         return 0;
2533
2534 fail:
2535         *exit_status = exit_status_table[type];
2536         return r;
2537 }
2538
2539 static int write_credential(
2540                 int dfd,
2541                 const char *id,
2542                 const void *data,
2543                 size_t size,
2544                 uid_t uid,
2545                 bool ownership_ok) {
2546
2547         _cleanup_(unlink_and_freep) char *tmp = NULL;
2548         _cleanup_close_ int fd = -1;
2549         int r;
2550
2551         r = tempfn_random_child("", "cred", &tmp);
2552         if (r < 0)
2553                 return r;
2554
2555         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2556         if (fd < 0) {
2557                 tmp = mfree(tmp);
2558                 return -errno;
2559         }
2560
2561         r = loop_write(fd, data, size, /* do_poll = */ false);
2562         if (r < 0)
2563                 return r;
2564
2565         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2566                 return -errno;
2567
2568         if (uid_is_valid(uid) && uid != getuid()) {
2569                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2570                 if (r < 0) {
2571                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2572                                 return r;
2573
2574                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2575                                             * to express: that the user gets read access and nothing
2576                                             * else. But if the backing fs can't support that (e.g. ramfs)
2577                                             * then we can use file ownership instead. But that's only safe if
2578                                             * we can then re-mount the whole thing read-only, so that the
2579                                             * user can no longer chmod() the file to gain write access. */
2580                                 return r;
2581
2582                         if (fchown(fd, uid, GID_INVALID) < 0)
2583                                 return -errno;
2584                 }
2585         }
2586
2587         if (renameat(dfd, tmp, dfd, id) < 0)
2588                 return -errno;
2589
2590         tmp = mfree(tmp);
2591         return 0;
2592 }
2593
2594 static int acquire_credentials(
2595                 const ExecContext *context,
2596                 const ExecParameters *params,
2597                 const char *unit,
2598                 const char *p,
2599                 uid_t uid,
2600                 bool ownership_ok) {
2601
2602         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2603         _cleanup_close_ int dfd = -1;
2604         ExecLoadCredential *lc;
2605         ExecSetCredential *sc;
2606         int r;
2607
2608         assert(context);
2609         assert(p);
2610
2611         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2612         if (dfd < 0)
2613                 return -errno;
2614
2615         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2616         HASHMAP_FOREACH(lc, context->load_credentials) {
2617                 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2618                 _cleanup_(erase_and_freep) char *data = NULL;
2619                 _cleanup_free_ char *j = NULL, *bindname = NULL;
2620                 bool missing_ok = true;
2621                 const char *source;
2622                 size_t size, add;
2623
2624                 if (path_is_absolute(lc->path)) {
2625                         /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2626                         source = lc->path;
2627                         flags |= READ_FULL_FILE_CONNECT_SOCKET;
2628
2629                         /* Pass some minimal info about the unit and the credential name we are looking to acquire
2630                          * via the source socket address in case we read off an AF_UNIX socket. */
2631                         if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
2632                                 return -ENOMEM;
2633
2634                         missing_ok = false;
2635
2636                 } else if (params->received_credentials) {
2637                         /* If this is a relative path, take it relative to the credentials we received
2638                          * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2639                          * on a credential store, i.e. this is guaranteed to be regular files. */
2640                         j = path_join(params->received_credentials, lc->path);
2641                         if (!j)
2642                                 return -ENOMEM;
2643
2644                         source = j;
2645                 } else
2646                         source = NULL;
2647
2648                 if (source)
2649                         r = read_full_file_full(
2650                                         AT_FDCWD, source,
2651                                         UINT64_MAX,
2652                                         lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2653                                         flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2654                                         bindname,
2655                                         &data, &size);
2656                 else
2657                         r = -ENOENT;
2658                 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
2659                         /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2660                          * will get clear errors if we don't pass such a missing credential on as they
2661                          * themselves will get ENOENT when trying to read them, which should not be much
2662                          * worse than when we handle the error here and make it fatal.
2663                          *
2664                          * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2665                          * we are fine, too. */
2666                         log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
2667                         continue;
2668                 }
2669                 if (r < 0)
2670                         return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2671
2672                 if (lc->encrypted) {
2673                         _cleanup_free_ void *plaintext = NULL;
2674                         size_t plaintext_size = 0;
2675
2676                         r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2677                         if (r < 0)
2678                                 return r;
2679
2680                         free_and_replace(data, plaintext);
2681                         size = plaintext_size;
2682                 }
2683
2684                 add = strlen(lc->id) + size;
2685                 if (add > left)
2686                         return -E2BIG;
2687
2688                 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
2689                 if (r < 0)
2690                         return r;
2691
2692                 left -= add;
2693         }
2694
2695         /* First we use the literally specified credentials. Note that they might be overridden again below,
2696          * and thus act as a "default" if the same credential is specified multiple times */
2697         HASHMAP_FOREACH(sc, context->set_credentials) {
2698                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2699                 const char *data;
2700                 size_t size, add;
2701
2702                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2703                         continue;
2704                 if (errno != ENOENT)
2705                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2706
2707                 if (sc->encrypted) {
2708                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2709                         if (r < 0)
2710                                 return r;
2711
2712                         data = plaintext;
2713                 } else {
2714                         data = sc->data;
2715                         size = sc->size;
2716                 }
2717
2718                 add = strlen(sc->id) + size;
2719                 if (add > left)
2720                         return -E2BIG;
2721
2722                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2723                 if (r < 0)
2724                         return r;
2725
2726
2727                 left -= add;
2728         }
2729
2730         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2731                 return -errno;
2732
2733         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2734          * accessible */
2735
2736         if (uid_is_valid(uid) && uid != getuid()) {
2737                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2738                 if (r < 0) {
2739                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2740                                 return r;
2741
2742                         if (!ownership_ok)
2743                                 return r;
2744
2745                         if (fchown(dfd, uid, GID_INVALID) < 0)
2746                                 return -errno;
2747                 }
2748         }
2749
2750         return 0;
2751 }
2752
2753 static int setup_credentials_internal(
2754                 const ExecContext *context,
2755                 const ExecParameters *params,
2756                 const char *unit,
2757                 const char *final,        /* This is where the credential store shall eventually end up at */
2758                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2759                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2760                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2761                 uid_t uid) {
2762
2763         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2764                                    * if we mounted something; false if we definitely can't mount anything */
2765         bool final_mounted;
2766         const char *where;
2767
2768         assert(context);
2769         assert(final);
2770         assert(workspace);
2771
2772         if (reuse_workspace) {
2773                 r = path_is_mount_point(workspace, NULL, 0);
2774                 if (r < 0)
2775                         return r;
2776                 if (r > 0)
2777                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2778                 else
2779                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2780         } else
2781                 workspace_mounted = -1; /* ditto */
2782
2783         r = path_is_mount_point(final, NULL, 0);
2784         if (r < 0)
2785                 return r;
2786         if (r > 0) {
2787                 /* If the final place already has something mounted, we use that. If the workspace also has
2788                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2789                  * different). */
2790                 final_mounted = true;
2791
2792                 if (workspace_mounted < 0) {
2793                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2794                          * the final version to the workspace, and make it writable, so that we can make
2795                          * changes */
2796
2797                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2798                         if (r < 0)
2799                                 return r;
2800
2801                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2802                         if (r < 0)
2803                                 return r;
2804
2805                         workspace_mounted = true;
2806                 }
2807         } else
2808                 final_mounted = false;
2809
2810         if (workspace_mounted < 0) {
2811                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2812                 for (int try = 0;; try++) {
2813
2814                         if (try == 0) {
2815                                 /* Try "ramfs" first, since it's not swap backed */
2816                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2817                                 if (r >= 0) {
2818                                         workspace_mounted = true;
2819                                         break;
2820                                 }
2821
2822                         } else if (try == 1) {
2823                                 _cleanup_free_ char *opts = NULL;
2824
2825                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
2826                                         return -ENOMEM;
2827
2828                                 /* Fall back to "tmpfs" otherwise */
2829                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2830                                 if (r >= 0) {
2831                                         workspace_mounted = true;
2832                                         break;
2833                                 }
2834
2835                         } else {
2836                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2837                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2838                                 if (r < 0) {
2839                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2840                                                 return r;
2841
2842                                         if (must_mount) /* If we it's not OK to use the plain directory
2843                                                          * fallback, propagate all errors too */
2844                                                 return r;
2845
2846                                         /* If we lack privileges to bind mount stuff, then let's gracefully
2847                                          * proceed for compat with container envs, and just use the final dir
2848                                          * as is. */
2849
2850                                         workspace_mounted = false;
2851                                         break;
2852                                 }
2853
2854                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2855                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2856                                 if (r < 0)
2857                                         return r;
2858
2859                                 workspace_mounted = true;
2860                                 break;
2861                         }
2862                 }
2863         }
2864
2865         assert(!must_mount || workspace_mounted > 0);
2866         where = workspace_mounted ? workspace : final;
2867
2868         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2869         if (r < 0)
2870                 return r;
2871
2872         if (workspace_mounted) {
2873                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2874                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2875                 if (r < 0)
2876                         return r;
2877
2878                 /* And mount it to the final place, read-only */
2879                 if (final_mounted)
2880                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2881                 else
2882                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2883                 if (r < 0)
2884                         return r;
2885         } else {
2886                 _cleanup_free_ char *parent = NULL;
2887
2888                 /* If we do not have our own mount put used the plain directory fallback, then we need to
2889                  * open access to the top-level credential directory and the per-service directory now */
2890
2891                 parent = dirname_malloc(final);
2892                 if (!parent)
2893                         return -ENOMEM;
2894                 if (chmod(parent, 0755) < 0)
2895                         return -errno;
2896         }
2897
2898         return 0;
2899 }
2900
2901 static int setup_credentials(
2902                 const ExecContext *context,
2903                 const ExecParameters *params,
2904                 const char *unit,
2905                 uid_t uid) {
2906
2907         _cleanup_free_ char *p = NULL, *q = NULL;
2908         const char *i;
2909         int r;
2910
2911         assert(context);
2912         assert(params);
2913
2914         if (!exec_context_has_credentials(context))
2915                 return 0;
2916
2917         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2918                 return -EINVAL;
2919
2920         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2921          * and the subdir we mount over with a read-only file system readable by the service's user */
2922         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2923         if (!q)
2924                 return -ENOMEM;
2925
2926         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2927         if (r < 0 && r != -EEXIST)
2928                 return r;
2929
2930         p = path_join(q, unit);
2931         if (!p)
2932                 return -ENOMEM;
2933
2934         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2935         if (r < 0 && r != -EEXIST)
2936                 return r;
2937
2938         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2939         if (r < 0) {
2940                 _cleanup_free_ char *t = NULL, *u = NULL;
2941
2942                 /* If this is not a privilege or support issue then propagate the error */
2943                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2944                         return r;
2945
2946                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2947                  * it into place, so that users can't access half-initialized credential stores. */
2948                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2949                 if (!t)
2950                         return -ENOMEM;
2951
2952                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2953                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2954                  * after it is fully set up */
2955                 u = path_join(t, unit);
2956                 if (!u)
2957                         return -ENOMEM;
2958
2959                 FOREACH_STRING(i, t, u) {
2960                         r = mkdir_label(i, 0700);
2961                         if (r < 0 && r != -EEXIST)
2962                                 return r;
2963                 }
2964
2965                 r = setup_credentials_internal(
2966                                 context,
2967                                 params,
2968                                 unit,
2969                                 p,       /* final mount point */
2970                                 u,       /* temporary workspace to overmount */
2971                                 true,    /* reuse the workspace if it is already a mount */
2972                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
2973                                 uid);
2974
2975                 (void) rmdir(u); /* remove the workspace again if we can. */
2976
2977                 if (r < 0)
2978                         return r;
2979
2980         } else if (r == 0) {
2981
2982                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2983                  * we can use the same directory for all cases, after turning off propagation. Question
2984                  * though is: where do we turn off propagation exactly, and where do we place the workspace
2985                  * directory? We need some place that is guaranteed to be a mount point in the host, and
2986                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2987                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
2988                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2989                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2990                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2991                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2992                  * propagation on the former, and then overmount the latter.
2993                  *
2994                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2995                  * for this purpose, but there are few other candidates that work equally well for us, and
2996                  * given that the we do this in a privately namespaced short-lived single-threaded process
2997                  * that no one else sees this should be OK to do. */
2998
2999                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3000                 if (r < 0)
3001                         goto child_fail;
3002
3003                 r = setup_credentials_internal(
3004                                 context,
3005                                 params,
3006                                 unit,
3007                                 p,           /* final mount point */
3008                                 "/dev/shm",  /* temporary workspace to overmount */
3009                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3010                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3011                                 uid);
3012                 if (r < 0)
3013                         goto child_fail;
3014
3015                 _exit(EXIT_SUCCESS);
3016
3017         child_fail:
3018                 _exit(EXIT_FAILURE);
3019         }
3020
3021         return 0;
3022 }
3023
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures a process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec label read from 'executable_fd' is
 * applied, or the build-time default if no such label could be read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing or unsupported exec label is fine, we then use the default below */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
3055
3056 static int compile_bind_mounts(
3057                 const ExecContext *context,
3058                 const ExecParameters *params,
3059                 BindMount **ret_bind_mounts,
3060                 size_t *ret_n_bind_mounts,
3061                 char ***ret_empty_directories) {
3062
3063         _cleanup_strv_free_ char **empty_directories = NULL;
3064         BindMount *bind_mounts;
3065         size_t n, h = 0;
3066         int r;
3067
3068         assert(context);
3069         assert(params);
3070         assert(ret_bind_mounts);
3071         assert(ret_n_bind_mounts);
3072         assert(ret_empty_directories);
3073
3074         n = context->n_bind_mounts;
3075         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3076                 if (!params->prefix[t])
3077                         continue;
3078
3079                 n += context->directories[t].n_items;
3080         }
3081
3082         if (n <= 0) {
3083                 *ret_bind_mounts = NULL;
3084                 *ret_n_bind_mounts = 0;
3085                 *ret_empty_directories = NULL;
3086                 return 0;
3087         }
3088
3089         bind_mounts = new(BindMount, n);
3090         if (!bind_mounts)
3091                 return -ENOMEM;
3092
3093         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3094                 BindMount *item = context->bind_mounts + i;
3095                 char *s, *d;
3096
3097                 s = strdup(item->source);
3098                 if (!s) {
3099                         r = -ENOMEM;
3100                         goto finish;
3101                 }
3102
3103                 d = strdup(item->destination);
3104                 if (!d) {
3105                         free(s);
3106                         r = -ENOMEM;
3107                         goto finish;
3108                 }
3109
3110                 bind_mounts[h++] = (BindMount) {
3111                         .source = s,
3112                         .destination = d,
3113                         .read_only = item->read_only,
3114                         .recursive = item->recursive,
3115                         .ignore_enoent = item->ignore_enoent,
3116                 };
3117         }
3118
3119         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3120                 if (!params->prefix[t])
3121                         continue;
3122
3123                 if (context->directories[t].n_items == 0)
3124                         continue;
3125
3126                 if (exec_directory_is_private(context, t) &&
3127                     !exec_context_with_rootfs(context)) {
3128                         char *private_root;
3129
3130                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3131                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3132                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3133
3134                         private_root = path_join(params->prefix[t], "private");
3135                         if (!private_root) {
3136                                 r = -ENOMEM;
3137                                 goto finish;
3138                         }
3139
3140                         r = strv_consume(&empty_directories, private_root);
3141                         if (r < 0)
3142                                 goto finish;
3143                 }
3144
3145                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3146                         char *s, *d;
3147
3148                         if (exec_directory_is_private(context, t))
3149                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3150                         else
3151                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3152                         if (!s) {
3153                                 r = -ENOMEM;
3154                                 goto finish;
3155                         }
3156
3157                         if (exec_directory_is_private(context, t) &&
3158                             exec_context_with_rootfs(context))
3159                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3160                                  * directory is not created on the root directory. So, let's bind-mount the directory
3161                                  * on the 'non-private' place. */
3162                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3163                         else
3164                                 d = strdup(s);
3165                         if (!d) {
3166                                 free(s);
3167                                 r = -ENOMEM;
3168                                 goto finish;
3169                         }
3170
3171                         bind_mounts[h++] = (BindMount) {
3172                                 .source = s,
3173                                 .destination = d,
3174                                 .read_only = false,
3175                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3176                                 .recursive = true,
3177                                 .ignore_enoent = false,
3178                         };
3179                 }
3180         }
3181
3182         assert(h == n);
3183
3184         *ret_bind_mounts = bind_mounts;
3185         *ret_n_bind_mounts = n;
3186         *ret_empty_directories = TAKE_PTR(empty_directories);
3187
3188         return (int) n;
3189
3190 finish:
3191         bind_mount_free_many(bind_mounts, h);
3192         return r;
3193 }
3194
3195 /* ret_symlinks will contain a list of pairs src:dest that describes
3196  * the symlinks to create later on. For example, the symlinks needed
3197  * to safely give private directories to DynamicUser=1 users. */
3198 static int compile_symlinks(
3199                 const ExecContext *context,
3200                 const ExecParameters *params,
3201                 char ***ret_symlinks) {
3202
3203         _cleanup_strv_free_ char **symlinks = NULL;
3204         int r;
3205
3206         assert(context);
3207         assert(params);
3208         assert(ret_symlinks);
3209
3210         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3211                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3212                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3213                         char **symlink;
3214
3215                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3216                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3217
3218                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3219                                 dst_abs = path_join(params->prefix[dt], *symlink);
3220                                 if (!src_abs || !dst_abs)
3221                                         return -ENOMEM;
3222
3223                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3224                                 if (r < 0)
3225                                         return r;
3226                         }
3227
3228                         if (!exec_directory_is_private(context, dt))
3229                                 continue;
3230
3231                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3232                         if (!private_path)
3233                                 return -ENOMEM;
3234
3235                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3236                         if (!path)
3237                                 return -ENOMEM;
3238
3239                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3240                         if (r < 0)
3241                                 return r;
3242                 }
3243         }
3244
3245         *ret_symlinks = TAKE_PTR(symlinks);
3246
3247         return 0;
3248 }
3249
3250 static bool insist_on_sandboxing(
3251                 const ExecContext *context,
3252                 const char *root_dir,
3253                 const char *root_image,
3254                 const BindMount *bind_mounts,
3255                 size_t n_bind_mounts) {
3256
3257         assert(context);
3258         assert(n_bind_mounts == 0 || bind_mounts);
3259
3260         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3261          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3262          * rearrange stuff in a way we cannot ignore gracefully. */
3263
3264         if (context->n_temporary_filesystems > 0)
3265                 return true;
3266
3267         if (root_dir || root_image)
3268                 return true;
3269
3270         if (context->n_mount_images > 0)
3271                 return true;
3272
3273         if (context->dynamic_user)
3274                 return true;
3275
3276         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3277          * essential. */
3278         for (size_t i = 0; i < n_bind_mounts; i++)
3279                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3280                         return true;
3281
3282         if (context->log_namespace)
3283                 return true;
3284
3285         return false;
3286 }
3287
3288 static int apply_mount_namespace(
3289                 const Unit *u,
3290                 ExecCommandFlags command_flags,
3291                 const ExecContext *context,
3292                 const ExecParameters *params,
3293                 const ExecRuntime *runtime,
3294                 char **error_path) {
3295
3296         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3297         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3298         const char *root_dir = NULL, *root_image = NULL;
3299         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3300         NamespaceInfo ns_info;
3301         bool needs_sandboxing;
3302         BindMount *bind_mounts = NULL;
3303         size_t n_bind_mounts = 0;
3304         int r;
3305
3306         assert(context);
3307
3308         if (params->flags & EXEC_APPLY_CHROOT) {
3309                 root_image = context->root_image;
3310
3311                 if (!root_image)
3312                         root_dir = context->root_directory;
3313         }
3314
3315         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3316         if (r < 0)
3317                 return r;
3318
3319         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3320         r = compile_symlinks(context, params, &symlinks);
3321         if (r < 0)
3322                 return r;
3323
3324         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3325         if (needs_sandboxing) {
3326                 /* The runtime struct only contains the parent of the private /tmp,
3327                  * which is non-accessible to world users. Inside of it there's a /tmp
3328                  * that is sticky, and that's the one we want to use here.
3329                  * This does not apply when we are using /run/systemd/empty as fallback. */
3330
3331                 if (context->private_tmp && runtime) {
3332                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3333                                 tmp_dir = runtime->tmp_dir;
3334                         else if (runtime->tmp_dir)
3335                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3336
3337                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3338                                 var_tmp_dir = runtime->var_tmp_dir;
3339                         else if (runtime->var_tmp_dir)
3340                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3341                 }
3342
3343                 ns_info = (NamespaceInfo) {
3344                         .ignore_protect_paths = false,
3345                         .private_dev = context->private_devices,
3346                         .protect_control_groups = context->protect_control_groups,
3347                         .protect_kernel_tunables = context->protect_kernel_tunables,
3348                         .protect_kernel_modules = context->protect_kernel_modules,
3349                         .protect_kernel_logs = context->protect_kernel_logs,
3350                         .protect_hostname = context->protect_hostname,
3351                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3352                         .private_mounts = context->private_mounts,
3353                         .protect_home = context->protect_home,
3354                         .protect_system = context->protect_system,
3355                         .protect_proc = context->protect_proc,
3356                         .proc_subset = context->proc_subset,
3357                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3358                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3359                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3360                 };
3361         } else if (!context->dynamic_user && root_dir)
3362                 /*
3363                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3364                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3365                  * fail if we are enable to apply the sandbox inside the mount namespace.
3366                  */
3367                 ns_info = (NamespaceInfo) {
3368                         .ignore_protect_paths = true,
3369                 };
3370         else
3371                 ns_info = (NamespaceInfo) {};
3372
3373         if (context->mount_flags == MS_SHARED)
3374                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3375
3376         if (exec_context_has_credentials(context) &&
3377             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3378             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3379                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3380                 if (!creds_path) {
3381                         r = -ENOMEM;
3382                         goto finalize;
3383                 }
3384         }
3385
3386         if (MANAGER_IS_SYSTEM(u->manager)) {
3387                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3388                 if (!propagate_dir) {
3389                         r = -ENOMEM;
3390                         goto finalize;
3391                 }
3392
3393                 incoming_dir = strdup("/run/systemd/incoming");
3394                 if (!incoming_dir) {
3395                         r = -ENOMEM;
3396                         goto finalize;
3397                 }
3398         }
3399
3400         r = setup_namespace(root_dir, root_image, context->root_image_options,
3401                             &ns_info, context->read_write_paths,
3402                             needs_sandboxing ? context->read_only_paths : NULL,
3403                             needs_sandboxing ? context->inaccessible_paths : NULL,
3404                             needs_sandboxing ? context->exec_paths : NULL,
3405                             needs_sandboxing ? context->no_exec_paths : NULL,
3406                             empty_directories,
3407                             symlinks,
3408                             bind_mounts,
3409                             n_bind_mounts,
3410                             context->temporary_filesystems,
3411                             context->n_temporary_filesystems,
3412                             context->mount_images,
3413                             context->n_mount_images,
3414                             tmp_dir,
3415                             var_tmp_dir,
3416                             creds_path,
3417                             context->log_namespace,
3418                             context->mount_flags,
3419                             context->root_hash, context->root_hash_size, context->root_hash_path,
3420                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3421                             context->root_verity,
3422                             context->extension_images,
3423                             context->n_extension_images,
3424                             propagate_dir,
3425                             incoming_dir,
3426                             root_dir || root_image ? params->notify_socket : NULL,
3427                             error_path);
3428
3429         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3430          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3431          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3432          * completely different execution environment. */
3433         if (r == -ENOANO) {
3434                 if (insist_on_sandboxing(
3435                                     context,
3436                                     root_dir, root_image,
3437                                     bind_mounts,
3438                                     n_bind_mounts)) {
3439                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3440                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3441                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3442
3443                         r = -EOPNOTSUPP;
3444                 } else {
3445                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3446                         r = 0;
3447                 }
3448         }
3449
3450 finalize:
3451         bind_mount_free_many(bind_mounts, n_bind_mounts);
3452         return r;
3453 }
3454
3455 static int apply_working_directory(
3456                 const ExecContext *context,
3457                 const ExecParameters *params,
3458                 const char *home,
3459                 int *exit_status) {
3460
3461         const char *d, *wd;
3462
3463         assert(context);
3464         assert(exit_status);
3465
3466         if (context->working_directory_home) {
3467
3468                 if (!home) {
3469                         *exit_status = EXIT_CHDIR;
3470                         return -ENXIO;
3471                 }
3472
3473                 wd = home;
3474
3475         } else
3476                 wd = empty_to_root(context->working_directory);
3477
3478         if (params->flags & EXEC_APPLY_CHROOT)
3479                 d = wd;
3480         else
3481                 d = prefix_roota(context->root_directory, wd);
3482
3483         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3484                 *exit_status = EXIT_CHDIR;
3485                 return -errno;
3486         }
3487
3488         return 0;
3489 }
3490
3491 static int apply_root_directory(
3492                 const ExecContext *context,
3493                 const ExecParameters *params,
3494                 const bool needs_mount_ns,
3495                 int *exit_status) {
3496
3497         assert(context);
3498         assert(exit_status);
3499
3500         if (params->flags & EXEC_APPLY_CHROOT)
3501                 if (!needs_mount_ns && context->root_directory)
3502                         if (chroot(context->root_directory) < 0) {
3503                                 *exit_status = EXIT_CHROOT;
3504                                 return -errno;
3505                         }
3506
3507         return 0;
3508 }
3509
3510 static int setup_keyring(
3511                 const Unit *u,
3512                 const ExecContext *context,
3513                 const ExecParameters *p,
3514                 uid_t uid, gid_t gid) {
3515
3516         key_serial_t keyring;
3517         int r = 0;
3518         uid_t saved_uid;
3519         gid_t saved_gid;
3520
3521         assert(u);
3522         assert(context);
3523         assert(p);
3524
3525         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3526          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3527          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3528          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3529          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3530          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3531
3532         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3533                 return 0;
3534
3535         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3536          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3537          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3538          * & group is just as nasty as acquiring a reference to the user keyring. */
3539
3540         saved_uid = getuid();
3541         saved_gid = getgid();
3542
3543         if (gid_is_valid(gid) && gid != saved_gid) {
3544                 if (setregid(gid, -1) < 0)
3545                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3546         }
3547
3548         if (uid_is_valid(uid) && uid != saved_uid) {
3549                 if (setreuid(uid, -1) < 0) {
3550                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3551                         goto out;
3552                 }
3553         }
3554
3555         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3556         if (keyring == -1) {
3557                 if (errno == ENOSYS)
3558                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3559                 else if (ERRNO_IS_PRIVILEGE(errno))
3560                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3561                 else if (errno == EDQUOT)
3562                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3563                 else
3564                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3565
3566                 goto out;
3567         }
3568
3569         /* When requested link the user keyring into the session keyring. */
3570         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3571
3572                 if (keyctl(KEYCTL_LINK,
3573                            KEY_SPEC_USER_KEYRING,
3574                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3575                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3576                         goto out;
3577                 }
3578         }
3579
3580         /* Restore uid/gid back */
3581         if (uid_is_valid(uid) && uid != saved_uid) {
3582                 if (setreuid(saved_uid, -1) < 0) {
3583                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3584                         goto out;
3585                 }
3586         }
3587
3588         if (gid_is_valid(gid) && gid != saved_gid) {
3589                 if (setregid(saved_gid, -1) < 0)
3590                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3591         }
3592
3593         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3594         if (!sd_id128_is_null(u->invocation_id)) {
3595                 key_serial_t key;
3596
3597                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3598                 if (key == -1)
3599                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3600                 else {
3601                         if (keyctl(KEYCTL_SETPERM, key,
3602                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3603                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3604                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3605                 }
3606         }
3607
3608 out:
3609         /* Revert back uid & gid for the last time, and exit */
3610         /* no extra logging, as only the first already reported error matters */
3611         if (getuid() != saved_uid)
3612                 (void) setreuid(saved_uid, -1);
3613
3614         if (getgid() != saved_gid)
3615                 (void) setregid(saved_gid, -1);
3616
3617         return r;
3618 }
3619
/* Appends each non-negative fd of the given two-element pair to array, advancing the element
 * counter *n for every fd stored. The caller must make sure the array has room for two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3630
3631 static int close_remaining_fds(
3632                 const ExecParameters *params,
3633                 const ExecRuntime *runtime,
3634                 const DynamicCreds *dcreds,
3635                 int user_lookup_fd,
3636                 int socket_fd,
3637                 const int *fds, size_t n_fds) {
3638
3639         size_t n_dont_close = 0;
3640         int dont_close[n_fds + 12];
3641
3642         assert(params);
3643
3644         if (params->stdin_fd >= 0)
3645                 dont_close[n_dont_close++] = params->stdin_fd;
3646         if (params->stdout_fd >= 0)
3647                 dont_close[n_dont_close++] = params->stdout_fd;
3648         if (params->stderr_fd >= 0)
3649                 dont_close[n_dont_close++] = params->stderr_fd;
3650
3651         if (socket_fd >= 0)
3652                 dont_close[n_dont_close++] = socket_fd;
3653         if (n_fds > 0) {
3654                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3655                 n_dont_close += n_fds;
3656         }
3657
3658         if (runtime) {
3659                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3660                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3661         }
3662
3663         if (dcreds) {
3664                 if (dcreds->user)
3665                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3666                 if (dcreds->group)
3667                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3668         }
3669
3670         if (user_lookup_fd >= 0)
3671                 dont_close[n_dont_close++] = user_lookup_fd;
3672
3673         return close_all_fds(dont_close, n_dont_close);
3674 }
3675
3676 static int send_user_lookup(
3677                 Unit *unit,
3678                 int user_lookup_fd,
3679                 uid_t uid,
3680                 gid_t gid) {
3681
3682         assert(unit);
3683
3684         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3685          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3686          * specified. */
3687
3688         if (user_lookup_fd < 0)
3689                 return 0;
3690
3691         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3692                 return 0;
3693
3694         if (writev(user_lookup_fd,
3695                (struct iovec[]) {
3696                            IOVEC_INIT(&uid, sizeof(uid)),
3697                            IOVEC_INIT(&gid, sizeof(gid)),
3698                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3699                 return -errno;
3700
3701         return 0;
3702 }
3703
3704 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3705         int r;
3706
3707         assert(c);
3708         assert(home);
3709         assert(buf);
3710
3711         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3712
3713         if (*home)
3714                 return 0;
3715
3716         if (!c->working_directory_home)
3717                 return 0;
3718
3719         r = get_home_dir(buf);
3720         if (r < 0)
3721                 return r;
3722
3723         *home = *buf;
3724         return 1;
3725 }
3726
3727 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3728         _cleanup_strv_free_ char ** list = NULL;
3729         int r;
3730
3731         assert(c);
3732         assert(p);
3733         assert(ret);
3734
3735         assert(c->dynamic_user);
3736
3737         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3738          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3739          * directories. */
3740
3741         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3742                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3743                         continue;
3744
3745                 if (!p->prefix[t])
3746                         continue;
3747
3748                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3749                         char *e;
3750
3751                         if (exec_directory_is_private(c, t))
3752                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3753                         else
3754                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3755                         if (!e)
3756                                 return -ENOMEM;
3757
3758                         r = strv_consume(&list, e);
3759                         if (r < 0)
3760                                 return r;
3761                 }
3762         }
3763
3764         *ret = TAKE_PTR(list);
3765
3766         return 0;
3767 }
3768
3769 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3770         bool using_subcgroup;
3771         char *p;
3772
3773         assert(params);
3774         assert(ret);
3775
3776         if (!params->cgroup_path)
3777                 return -EINVAL;
3778
3779         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3780          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3781          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3782          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3783          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3784          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3785          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3786          * flag, which is only passed for the former statements, not for the latter. */
3787
3788         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3789         if (using_subcgroup)
3790                 p = path_join(params->cgroup_path, ".control");
3791         else
3792                 p = strdup(params->cgroup_path);
3793         if (!p)
3794                 return -ENOMEM;
3795
3796         *ret = p;
3797         return using_subcgroup;
3798 }
3799
3800 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3801         _cleanup_(cpu_set_reset) CPUSet s = {};
3802         int r;
3803
3804         assert(c);
3805         assert(ret);
3806
3807         if (!c->numa_policy.nodes.set) {
3808                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3809                 return 0;
3810         }
3811
3812         r = numa_to_cpu_set(&c->numa_policy, &s);
3813         if (r < 0)
3814                 return r;
3815
3816         cpu_set_reset(ret);
3817
3818         return cpu_set_add_all(ret, &s);
3819 }
3820
3821 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3822         assert(c);
3823
3824         return c->cpu_affinity_from_numa;
3825 }
3826
3827 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3828         int r;
3829
3830         assert(fds);
3831         assert(n_fds);
3832         assert(*n_fds < fds_size);
3833         assert(ret_fd);
3834
3835         if (fd < 0) {
3836                 *ret_fd = -1;
3837                 return 0;
3838         }
3839
3840         if (fd < 3 + (int) *n_fds) {
3841                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3842                  * the fds we pass to the process (or which are closed only during execve). */
3843
3844                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3845                 if (r < 0)
3846                         return -errno;
3847
3848                 CLOSE_AND_REPLACE(fd, r);
3849         }
3850
3851         *ret_fd = fds[*n_fds] = fd;
3852         (*n_fds) ++;
3853         return 1;
3854 }
3855
3856 static int exec_child(
3857                 Unit *unit,
3858                 const ExecCommand *command,
3859                 const ExecContext *context,
3860                 const ExecParameters *params,
3861                 ExecRuntime *runtime,
3862                 DynamicCreds *dcreds,
3863                 int socket_fd,
3864                 const int named_iofds[static 3],
3865                 int *fds,
3866                 size_t n_socket_fds,
3867                 size_t n_storage_fds,
3868                 char **files_env,
3869                 int user_lookup_fd,
3870                 int *exit_status) {
3871
3872         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3873         int r, ngids = 0, exec_fd;
3874         _cleanup_free_ gid_t *supplementary_gids = NULL;
3875         const char *username = NULL, *groupname = NULL;
3876         _cleanup_free_ char *home_buffer = NULL;
3877         const char *home = NULL, *shell = NULL;
3878         char **final_argv = NULL;
3879         dev_t journal_stream_dev = 0;
3880         ino_t journal_stream_ino = 0;
3881         bool userns_set_up = false;
3882         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3883                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3884                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3885                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3886 #if HAVE_SELINUX
3887         _cleanup_free_ char *mac_selinux_context_net = NULL;
3888         bool use_selinux = false;
3889 #endif
3890 #if ENABLE_SMACK
3891         bool use_smack = false;
3892 #endif
3893 #if HAVE_APPARMOR
3894         bool use_apparmor = false;
3895 #endif
3896         uid_t saved_uid = getuid();
3897         gid_t saved_gid = getgid();
3898         uid_t uid = UID_INVALID;
3899         gid_t gid = GID_INVALID;
3900         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3901                n_keep_fds; /* total number of fds not to close */
3902         int secure_bits;
3903         _cleanup_free_ gid_t *gids_after_pam = NULL;
3904         int ngids_after_pam = 0;
3905
3906         assert(unit);
3907         assert(command);
3908         assert(context);
3909         assert(params);
3910         assert(exit_status);
3911
3912         rename_process_from_path(command->path);
3913
3914         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3915          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3916          * both of which will be demoted to SIG_DFL. */
3917         (void) default_signals(SIGNALS_CRASH_HANDLER,
3918                                SIGNALS_IGNORE);
3919
3920         if (context->ignore_sigpipe)
3921                 (void) ignore_signals(SIGPIPE);
3922
3923         r = reset_signal_mask();
3924         if (r < 0) {
3925                 *exit_status = EXIT_SIGNAL_MASK;
3926                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3927         }
3928
3929         if (params->idle_pipe)
3930                 do_idle_pipe_dance(params->idle_pipe);
3931
3932         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3933          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3934          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3935          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3936
3937         log_forget_fds();
3938         log_set_open_when_needed(true);
3939
3940         /* In case anything used libc syslog(), close this here, too */
3941         closelog();
3942
3943         int keep_fds[n_fds + 3];
3944         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3945         n_keep_fds = n_fds;
3946
3947         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3948         if (r < 0) {
3949                 *exit_status = EXIT_FDS;
3950                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3951         }
3952
3953 #if HAVE_LIBBPF
3954         if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
3955                 int bpf_map_fd = -1;
3956
3957                 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
3958                 if (bpf_map_fd < 0) {
3959                         *exit_status = EXIT_FDS;
3960                         return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
3961                 }
3962
3963                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
3964                 if (r < 0) {
3965                         *exit_status = EXIT_FDS;
3966                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3967                 }
3968         }
3969 #endif
3970
3971         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3972         if (r < 0) {
3973                 *exit_status = EXIT_FDS;
3974                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3975         }
3976
3977         if (!context->same_pgrp &&
3978             setsid() < 0) {
3979                 *exit_status = EXIT_SETSID;
3980                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3981         }
3982
3983         exec_context_tty_reset(context, params);
3984
3985         if (unit_shall_confirm_spawn(unit)) {
3986                 const char *vc = params->confirm_spawn;
3987                 _cleanup_free_ char *cmdline = NULL;
3988
3989                 cmdline = quote_command_line(command->argv);
3990                 if (!cmdline) {
3991                         *exit_status = EXIT_MEMORY;
3992                         return log_oom();
3993                 }
3994
3995                 r = ask_for_confirmation(vc, unit, cmdline);
3996                 if (r != CONFIRM_EXECUTE) {
3997                         if (r == CONFIRM_PRETEND_SUCCESS) {
3998                                 *exit_status = EXIT_SUCCESS;
3999                                 return 0;
4000                         }
4001                         *exit_status = EXIT_CONFIRM;
4002                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4003                                                     "Execution cancelled by the user");
4004                 }
4005         }
4006
4007         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4008          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4009          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4010          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4011          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4012         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4013             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4014                 *exit_status = EXIT_MEMORY;
4015                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4016         }
4017
4018         if (context->dynamic_user && dcreds) {
4019                 _cleanup_strv_free_ char **suggested_paths = NULL;
4020
4021                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4022                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4023                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4024                         *exit_status = EXIT_USER;
4025                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4026                 }
4027
4028                 r = compile_suggested_paths(context, params, &suggested_paths);
4029                 if (r < 0) {
4030                         *exit_status = EXIT_MEMORY;
4031                         return log_oom();
4032                 }
4033
4034                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4035                 if (r < 0) {
4036                         *exit_status = EXIT_USER;
4037                         if (r == -EILSEQ)
4038                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4039                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4040                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4041                 }
4042
4043                 if (!uid_is_valid(uid)) {
4044                         *exit_status = EXIT_USER;
4045                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4046                 }
4047
4048                 if (!gid_is_valid(gid)) {
4049                         *exit_status = EXIT_USER;
4050                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4051                 }
4052
4053                 if (dcreds->user)
4054                         username = dcreds->user->name;
4055
4056         } else {
4057                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4058                 if (r < 0) {
4059                         *exit_status = EXIT_USER;
4060                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4061                 }
4062
4063                 r = get_fixed_group(context, &groupname, &gid);
4064                 if (r < 0) {
4065                         *exit_status = EXIT_GROUP;
4066                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4067                 }
4068         }
4069
4070         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4071         r = get_supplementary_groups(context, username, groupname, gid,
4072                                      &supplementary_gids, &ngids);
4073         if (r < 0) {
4074                 *exit_status = EXIT_GROUP;
4075                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4076         }
4077
4078         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4079         if (r < 0) {
4080                 *exit_status = EXIT_USER;
4081                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4082         }
4083
4084         user_lookup_fd = safe_close(user_lookup_fd);
4085
4086         r = acquire_home(context, uid, &home, &home_buffer);
4087         if (r < 0) {
4088                 *exit_status = EXIT_CHDIR;
4089                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4090         }
4091
4092         /* If a socket is connected to STDIN/STDOUT/STDERR, we
4093          * must be sure to drop O_NONBLOCK */
4094         if (socket_fd >= 0)
4095                 (void) fd_nonblock(socket_fd, false);
4096
4097         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4098          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4099         if (params->cgroup_path) {
4100                 _cleanup_free_ char *p = NULL;
4101
4102                 r = exec_parameters_get_cgroup_path(params, &p);
4103                 if (r < 0) {
4104                         *exit_status = EXIT_CGROUP;
4105                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4106                 }
4107
4108                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4109                 if (r < 0) {
4110                         *exit_status = EXIT_CGROUP;
4111                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4112                 }
4113         }
4114
4115         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4116                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4117                 if (r < 0) {
4118                         *exit_status = EXIT_NETWORK;
4119                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4120                 }
4121         }
4122
4123         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4124                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4125                 if (r < 0) {
4126                         *exit_status = EXIT_NAMESPACE;
4127                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4128                 }
4129         }
4130
4131         r = setup_input(context, params, socket_fd, named_iofds);
4132         if (r < 0) {
4133                 *exit_status = EXIT_STDIN;
4134                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4135         }
4136
4137         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4138         if (r < 0) {
4139                 *exit_status = EXIT_STDOUT;
4140                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4141         }
4142
4143         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4144         if (r < 0) {
4145                 *exit_status = EXIT_STDERR;
4146                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4147         }
4148
4149         if (context->oom_score_adjust_set) {
4150                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4151                  * prohibit write access to this file, and we shouldn't trip up over that. */
4152                 r = set_oom_score_adjust(context->oom_score_adjust);
4153                 if (ERRNO_IS_PRIVILEGE(r))
4154                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4155                 else if (r < 0) {
4156                         *exit_status = EXIT_OOM_ADJUST;
4157                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4158                 }
4159         }
4160
4161         if (context->coredump_filter_set) {
4162                 r = set_coredump_filter(context->coredump_filter);
4163                 if (ERRNO_IS_PRIVILEGE(r))
4164                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4165                 else if (r < 0)
4166                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4167         }
4168
4169         if (context->nice_set) {
4170                 r = setpriority_closest(context->nice);
4171                 if (r < 0)
4172                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4173         }
4174
4175         if (context->cpu_sched_set) {
4176                 struct sched_param param = {
4177                         .sched_priority = context->cpu_sched_priority,
4178                 };
4179
4180                 r = sched_setscheduler(0,
4181                                        context->cpu_sched_policy |
4182                                        (context->cpu_sched_reset_on_fork ?
4183                                         SCHED_RESET_ON_FORK : 0),
4184                                        &param);
4185                 if (r < 0) {
4186                         *exit_status = EXIT_SETSCHEDULER;
4187                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4188                 }
4189         }
4190
4191         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4192                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4193                 const CPUSet *cpu_set;
4194
4195                 if (context->cpu_affinity_from_numa) {
4196                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4197                         if (r < 0) {
4198                                 *exit_status = EXIT_CPUAFFINITY;
4199                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4200                         }
4201
4202                         cpu_set = &converted_cpu_set;
4203                 } else
4204                         cpu_set = &context->cpu_set;
4205
4206                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4207                         *exit_status = EXIT_CPUAFFINITY;
4208                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4209                 }
4210         }
4211
4212         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4213                 r = apply_numa_policy(&context->numa_policy);
4214                 if (r == -EOPNOTSUPP)
4215                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4216                 else if (r < 0) {
4217                         *exit_status = EXIT_NUMA_POLICY;
4218                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4219                 }
4220         }
4221
4222         if (context->ioprio_set)
4223                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4224                         *exit_status = EXIT_IOPRIO;
4225                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4226                 }
4227
4228         if (context->timer_slack_nsec != NSEC_INFINITY)
4229                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4230                         *exit_status = EXIT_TIMERSLACK;
4231                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4232                 }
4233
4234         if (context->personality != PERSONALITY_INVALID) {
4235                 r = safe_personality(context->personality);
4236                 if (r < 0) {
4237                         *exit_status = EXIT_PERSONALITY;
4238                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4239                 }
4240         }
4241
4242         if (context->utmp_id) {
4243                 const char *line = context->tty_path ?
4244                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4245                         NULL;
4246                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4247                                       line,
4248                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4249                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4250                                       USER_PROCESS,
4251                                       username);
4252         }
4253
4254         if (uid_is_valid(uid)) {
4255                 r = chown_terminal(STDIN_FILENO, uid);
4256                 if (r < 0) {
4257                         *exit_status = EXIT_STDIN;
4258                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4259                 }
4260         }
4261
4262         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4263          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4264          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4265          * touch a single hierarchy too. */
4266         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4267                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4268                 if (r < 0) {
4269                         *exit_status = EXIT_CGROUP;
4270                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4271                 }
4272         }
4273
4274         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4275
4276         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4277                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4278                 if (r < 0)
4279                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4280         }
4281
4282         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4283                 r = setup_credentials(context, params, unit->id, uid);
4284                 if (r < 0) {
4285                         *exit_status = EXIT_CREDENTIALS;
4286                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4287                 }
4288         }
4289
4290         r = build_environment(
4291                         unit,
4292                         context,
4293                         params,
4294                         n_fds,
4295                         home,
4296                         username,
4297                         shell,
4298                         journal_stream_dev,
4299                         journal_stream_ino,
4300                         &our_env);
4301         if (r < 0) {
4302                 *exit_status = EXIT_MEMORY;
4303                 return log_oom();
4304         }
4305
4306         r = build_pass_environment(context, &pass_env);
4307         if (r < 0) {
4308                 *exit_status = EXIT_MEMORY;
4309                 return log_oom();
4310         }
4311
4312         /* The PATH variable is set to the default path in params->environment.
4313          * However, this is overridden if user specified fields have PATH set.
4314          * The intention is to also override PATH if the user does
4315          * not specify PATH and the user has specified ExecSearchPath
4316          */
4317
4318         if (!strv_isempty(context->exec_search_path)) {
4319                 _cleanup_free_ char *joined = NULL;
4320
4321                 joined = strv_join(context->exec_search_path, ":");
4322                 if (!joined) {
4323                         *exit_status = EXIT_MEMORY;
4324                         return log_oom();
4325                 }
4326
4327                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4328                 if (r < 0) {
4329                         *exit_status = EXIT_MEMORY;
4330                         return log_oom();
4331                 }
4332         }
4333
4334         accum_env = strv_env_merge(params->environment,
4335                                    our_env,
4336                                    joined_exec_search_path,
4337                                    pass_env,
4338                                    context->environment,
4339                                    files_env);
4340         if (!accum_env) {
4341                 *exit_status = EXIT_MEMORY;
4342                 return log_oom();
4343         }
4344         accum_env = strv_env_clean(accum_env);
4345
4346         (void) umask(context->umask);
4347
4348         r = setup_keyring(unit, context, params, uid, gid);
4349         if (r < 0) {
4350                 *exit_status = EXIT_KEYRING;
4351                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4352         }
4353
4354         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4355         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4356
4357         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4358         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4359
4360         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4361         if (needs_ambient_hack)
4362                 needs_setuid = false;
4363         else
4364                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4365
4366         if (needs_sandboxing) {
4367                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4368                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4369                  * impacting our own code paths. */
4370
4371 #if HAVE_SELINUX
4372                 use_selinux = mac_selinux_use();
4373 #endif
4374 #if ENABLE_SMACK
4375                 use_smack = mac_smack_use();
4376 #endif
4377 #if HAVE_APPARMOR
4378                 use_apparmor = mac_apparmor_use();
4379 #endif
4380         }
4381
4382         if (needs_sandboxing) {
4383                 int which_failed;
4384
4385                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4386                  * is set here. (See below.) */
4387
4388                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4389                 if (r < 0) {
4390                         *exit_status = EXIT_LIMITS;
4391                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4392                 }
4393         }
4394
4395         if (needs_setuid && context->pam_name && username) {
4396                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4397                  * wins here. (See above.) */
4398
4399                 /* All fds passed in the fds array will be closed in the pam child process. */
4400                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4401                 if (r < 0) {
4402                         *exit_status = EXIT_PAM;
4403                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4404                 }
4405
4406                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4407                 if (ngids_after_pam < 0) {
4408                         *exit_status = EXIT_MEMORY;
4409                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4410                 }
4411         }
4412
4413         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4414                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4415                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4416                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4417
4418                 userns_set_up = true;
4419                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4420                 if (r < 0) {
4421                         *exit_status = EXIT_USER;
4422                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4423                 }
4424         }
4425
4426         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4427
4428                 if (ns_type_supported(NAMESPACE_NET)) {
4429                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4430                         if (r == -EPERM)
4431                                 log_unit_warning_errno(unit, r,
4432                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4433                         else if (r < 0) {
4434                                 *exit_status = EXIT_NETWORK;
4435                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4436                         }
4437                 } else if (context->network_namespace_path) {
4438                         *exit_status = EXIT_NETWORK;
4439                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4440                                                     "NetworkNamespacePath= is not supported, refusing.");
4441                 } else
4442                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4443         }
4444
4445         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4446
4447                 if (ns_type_supported(NAMESPACE_IPC)) {
4448                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4449                         if (r == -EPERM)
4450                                 log_unit_warning_errno(unit, r,
4451                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4452                         else if (r < 0) {
4453                                 *exit_status = EXIT_NAMESPACE;
4454                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4455                         }
4456                 } else if (context->ipc_namespace_path) {
4457                         *exit_status = EXIT_NAMESPACE;
4458                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4459                                                     "IPCNamespacePath= is not supported, refusing.");
4460                 } else
4461                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4462         }
4463
4464         if (needs_mount_namespace) {
4465                 _cleanup_free_ char *error_path = NULL;
4466
4467                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4468                 if (r < 0) {
4469                         *exit_status = EXIT_NAMESPACE;
4470                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4471                                                     error_path ? ": " : "", strempty(error_path));
4472                 }
4473         }
4474
4475         if (needs_sandboxing) {
4476                 r = apply_protect_hostname(unit, context, exit_status);
4477                 if (r < 0)
4478                         return r;
4479         }
4480
4481         /* Drop groups as early as possible.
4482          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4483          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4484         if (needs_setuid) {
4485                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4486                 int ngids_to_enforce = 0;
4487
4488                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4489                                                    ngids,
4490                                                    gids_after_pam,
4491                                                    ngids_after_pam,
4492                                                    &gids_to_enforce);
4493                 if (ngids_to_enforce < 0) {
4494                         *exit_status = EXIT_MEMORY;
4495                         return log_unit_error_errno(unit,
4496                                                     ngids_to_enforce,
4497                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4498                 }
4499
4500                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4501                 if (r < 0) {
4502                         *exit_status = EXIT_GROUP;
4503                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4504                 }
4505         }
4506
4507         /* If the user namespace was not set up above, try to do it now.
4508          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4509          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4510          * case of mount namespaces being less privileged when the mount point list is copied from a
4511          * different user namespace). */
4512
4513         if (needs_sandboxing && context->private_users && !userns_set_up) {
4514                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4515                 if (r < 0) {
4516                         *exit_status = EXIT_USER;
4517                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4518                 }
4519         }
4520
4521         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4522          * shall execute. */
4523
4524         _cleanup_free_ char *executable = NULL;
4525         _cleanup_close_ int executable_fd = -1;
4526         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4527         if (r < 0) {
4528                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4529                         log_unit_struct_errno(unit, LOG_INFO, r,
4530                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4531                                               LOG_UNIT_INVOCATION_ID(unit),
4532                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4533                                                                command->path),
4534                                               "EXECUTABLE=%s", command->path);
4535                         return 0;
4536                 }
4537
4538                 *exit_status = EXIT_EXEC;
4539
4540                 return log_unit_struct_errno(unit, LOG_INFO, r,
4541                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4542                                              LOG_UNIT_INVOCATION_ID(unit),
4543                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4544                                                               command->path),
4545                                              "EXECUTABLE=%s", command->path);
4546         }
4547
4548         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4549         if (r < 0) {
4550                 *exit_status = EXIT_FDS;
4551                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4552         }
4553
4554 #if HAVE_SELINUX
4555         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4556                 int fd = -1;
4557
4558                 if (socket_fd >= 0)
4559                         fd = socket_fd;
4560                 else if (params->n_socket_fds == 1)
4561                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4562                          * use context from that fd to compute the label. */
4563                         fd = params->fds[0];
4564
4565                 if (fd >= 0) {
4566                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4567                         if (r < 0) {
4568                                 *exit_status = EXIT_SELINUX_CONTEXT;
4569                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4570                         }
4571                 }
4572         }
4573 #endif
4574
4575         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4576          * more aggressive this time, since we no longer need socket_fd and the netns and ipcns fds. We do keep the exec_fd
4577          * however if we have it as we want to keep it open until the final execve(). */
4578
4579         r = close_all_fds(keep_fds, n_keep_fds);
4580         if (r >= 0)
4581                 r = shift_fds(fds, n_fds);
4582         if (r >= 0)
4583                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4584         if (r < 0) {
4585                 *exit_status = EXIT_FDS;
4586                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4587         }
4588
4589         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4590          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4591          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4592          * came this far. */
4593
4594         secure_bits = context->secure_bits;
4595
4596         if (needs_sandboxing) {
4597                 uint64_t bset;
4598
4599                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4600                  * requested. (Note this is placed after the general resource limit initialization, see
4601                  * above, in order to take precedence.) */
4602                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4603                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4604                                 *exit_status = EXIT_LIMITS;
4605                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4606                         }
4607                 }
4608
4609 #if ENABLE_SMACK
4610                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4611                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4612                 if (use_smack) {
4613                         r = setup_smack(context, executable_fd);
4614                         if (r < 0) {
4615                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4616                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4617                         }
4618                 }
4619 #endif
4620
4621                 bset = context->capability_bounding_set;
4622                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4623                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4624                  * instead of us doing that */
4625                 if (needs_ambient_hack)
4626                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4627                                 (UINT64_C(1) << CAP_SETUID) |
4628                                 (UINT64_C(1) << CAP_SETGID);
4629
4630                 if (!cap_test_all(bset)) {
4631                         r = capability_bounding_set_drop(bset, false);
4632                         if (r < 0) {
4633                                 *exit_status = EXIT_CAPABILITIES;
4634                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4635                         }
4636                 }
4637
4638                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4639                  * keep-caps set.
4640                  * To be able to raise the ambient capabilities after setresuid() they have to be
4641                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4642                  * After setresuid() the ambient capabilities can be raised as they are present in
4643                  * the permitted and inheritable set. However it is possible that someone wants to
4644                  * set ambient capabilities without changing the user, so we also set the ambient
4645                  * capabilities here.
4646                  * The requested ambient capabilities are raised in the inheritable set if the
4647                  * second argument is true. */
4648                 if (!needs_ambient_hack) {
4649                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4650                         if (r < 0) {
4651                                 *exit_status = EXIT_CAPABILITIES;
4652                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4653                         }
4654                 }
4655         }
4656
4657         /* chroot to root directory first, before we lose the ability to chroot */
4658         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4659         if (r < 0)
4660                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4661
4662         if (needs_setuid) {
4663                 if (uid_is_valid(uid)) {
4664                         r = enforce_user(context, uid);
4665                         if (r < 0) {
4666                                 *exit_status = EXIT_USER;
4667                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4668                         }
4669
4670                         if (!needs_ambient_hack &&
4671                             context->capability_ambient_set != 0) {
4672
4673                                 /* Raise the ambient capabilities after user change. */
4674                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4675                                 if (r < 0) {
4676                                         *exit_status = EXIT_CAPABILITIES;
4677                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4678                                 }
4679                         }
4680                 }
4681         }
4682
4683         /* Apply working directory here, because the working directory might be on NFS and only the user running
4684          * this service might have the correct privilege to change to the working directory */
4685         r = apply_working_directory(context, params, home, exit_status);
4686         if (r < 0)
4687                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4688
4689         if (needs_sandboxing) {
4690                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4691                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4692                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4693                  * are restricted. */
4694
4695 #if HAVE_SELINUX
4696                 if (use_selinux) {
4697                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4698
4699                         if (exec_context) {
4700                                 r = setexeccon(exec_context);
4701                                 if (r < 0) {
4702                                         *exit_status = EXIT_SELINUX_CONTEXT;
4703                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4704                                 }
4705                         }
4706                 }
4707 #endif
4708
4709 #if HAVE_APPARMOR
4710                 if (use_apparmor && context->apparmor_profile) {
4711                         r = aa_change_onexec(context->apparmor_profile);
4712                         if (r < 0 && !context->apparmor_profile_ignore) {
4713                                 *exit_status = EXIT_APPARMOR_PROFILE;
4714                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4715                         }
4716                 }
4717 #endif
4718
4719                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4720                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4721                  * CAP_SETPCAP. */
4722                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4723                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4724                          * effective set here.
4725                          * The effective set is overwritten during execve with the following values:
4726                          * - ambient set (for non-root processes)
4727                          * - (inheritable | bounding) set (for root processes)
4728                          *
4729                          * Hence there is no security impact to raise it in the effective set before execve
4730                          */
4731                         r = capability_gain_cap_setpcap(NULL);
4732                         if (r < 0) {
4733                                 *exit_status = EXIT_CAPABILITIES;
4734                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4735                         }
4736                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4737                                 *exit_status = EXIT_SECUREBITS;
4738                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4739                         }
4740                 }
4741
4742                 if (context_has_no_new_privileges(context))
4743                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4744                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4745                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4746                         }
4747
4748 #if HAVE_SECCOMP
4749                 r = apply_address_families(unit, context);
4750                 if (r < 0) {
4751                         *exit_status = EXIT_ADDRESS_FAMILIES;
4752                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4753                 }
4754
4755                 r = apply_memory_deny_write_execute(unit, context);
4756                 if (r < 0) {
4757                         *exit_status = EXIT_SECCOMP;
4758                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4759                 }
4760
4761                 r = apply_restrict_realtime(unit, context);
4762                 if (r < 0) {
4763                         *exit_status = EXIT_SECCOMP;
4764                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4765                 }
4766
4767                 r = apply_restrict_suid_sgid(unit, context);
4768                 if (r < 0) {
4769                         *exit_status = EXIT_SECCOMP;
4770                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4771                 }
4772
4773                 r = apply_restrict_namespaces(unit, context);
4774                 if (r < 0) {
4775                         *exit_status = EXIT_SECCOMP;
4776                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4777                 }
4778
4779                 r = apply_protect_sysctl(unit, context);
4780                 if (r < 0) {
4781                         *exit_status = EXIT_SECCOMP;
4782                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4783                 }
4784
4785                 r = apply_protect_kernel_modules(unit, context);
4786                 if (r < 0) {
4787                         *exit_status = EXIT_SECCOMP;
4788                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4789                 }
4790
4791                 r = apply_protect_kernel_logs(unit, context);
4792                 if (r < 0) {
4793                         *exit_status = EXIT_SECCOMP;
4794                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4795                 }
4796
4797                 r = apply_protect_clock(unit, context);
4798                 if (r < 0) {
4799                         *exit_status = EXIT_SECCOMP;
4800                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4801                 }
4802
4803                 r = apply_private_devices(unit, context);
4804                 if (r < 0) {
4805                         *exit_status = EXIT_SECCOMP;
4806                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4807                 }
4808
4809                 r = apply_syscall_archs(unit, context);
4810                 if (r < 0) {
4811                         *exit_status = EXIT_SECCOMP;
4812                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4813                 }
4814
4815                 r = apply_lock_personality(unit, context);
4816                 if (r < 0) {
4817                         *exit_status = EXIT_SECCOMP;
4818                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4819                 }
4820
4821                 r = apply_syscall_log(unit, context);
4822                 if (r < 0) {
4823                         *exit_status = EXIT_SECCOMP;
4824                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4825                 }
4826
4827                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4828                  * by the filter as little as possible. */
4829                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4830                 if (r < 0) {
4831                         *exit_status = EXIT_SECCOMP;
4832                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4833                 }
4834 #endif
4835
4836 #if HAVE_LIBBPF
4837                 r = apply_restrict_filesystems(unit, context);
4838                 if (r < 0) {
4839                         *exit_status = EXIT_BPF;
4840                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
4841                 }
4842 #endif
4843
4844         }
4845
4846         if (!strv_isempty(context->unset_environment)) {
4847                 char **ee = NULL;
4848
4849                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4850                 if (!ee) {
4851                         *exit_status = EXIT_MEMORY;
4852                         return log_oom();
4853                 }
4854
4855                 strv_free_and_replace(accum_env, ee);
4856         }
4857
4858         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4859                 replaced_argv = replace_env_argv(command->argv, accum_env);
4860                 if (!replaced_argv) {
4861                         *exit_status = EXIT_MEMORY;
4862                         return log_oom();
4863                 }
4864                 final_argv = replaced_argv;
4865         } else
4866                 final_argv = command->argv;
4867
4868         if (DEBUG_LOGGING) {
4869                 _cleanup_free_ char *line = NULL;
4870
4871                 line = quote_command_line(final_argv);
4872                 if (!line) {
4873                         *exit_status = EXIT_MEMORY;
4874                         return log_oom();
4875                 }
4876
4877                 log_unit_struct(unit, LOG_DEBUG,
4878                                 "EXECUTABLE=%s", executable,
4879                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
4880         }
4881
4882         if (exec_fd >= 0) {
4883                 uint8_t hot = 1;
4884
4885                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4886                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4887
4888                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4889                         *exit_status = EXIT_EXEC;
4890                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4891                 }
4892         }
4893
4894         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4895
4896         if (exec_fd >= 0) {
4897                 uint8_t hot = 0;
4898
4899                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4900                  * that POLLHUP on it no longer means execve() succeeded. */
4901
4902                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4903                         *exit_status = EXIT_EXEC;
4904                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4905                 }
4906         }
4907
4908         *exit_status = EXIT_EXEC;
4909         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4910 }
4911
/* Forward declarations of two file-local helpers that exec_spawn() below calls; their definitions
 * are not part of this section. The "[static 3]" in the second prototype requires callers to pass
 * an array of at least three ints. */
4912 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4913 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4914
4915 int exec_spawn(Unit *unit,
4916                ExecCommand *command,
4917                const ExecContext *context,
4918                const ExecParameters *params,
4919                ExecRuntime *runtime,
4920                DynamicCreds *dcreds,
4921                pid_t *ret) {
             /* Forks off a child process for 'command'.
              *
              * Parent side: validates the parameters, selects which file descriptors are handed to the
              * child, loads the environment files, optionally creates a sub-cgroup and then fork()s.
              * After the fork it attaches the new PID to the sub-cgroup path (if one was obtained),
              * records the start in command->exec_status, stores the PID in *ret and returns 0.
              *
              * Child side: calls exec_child() and always terminates through _exit() with the exit status
              * exec_child() reported via its last argument.
              *
              * 'runtime' and 'dcreds' are not inspected here, they are only forwarded to exec_child().
              *
              * Every failure up to and including fork() is logged and the result of the respective
              * log_unit_error_errno()/log_oom() call is returned (presumably a negative errno-style
              * value - confirm against the logging helpers). */
4922
4923         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4924         _cleanup_free_ char *subcgroup_path = NULL;
4925         _cleanup_strv_free_ char **files_env = NULL;
4926         size_t n_storage_fds = 0, n_socket_fds = 0;
4927         _cleanup_free_ char *line = NULL;
4928         pid_t pid;
4929
4930         assert(unit);
4931         assert(command);
4932         assert(context);
4933         assert(ret);
4934         assert(params);
             /* params->fds may only be NULL if neither socket nor storage fds are announced. */
4935         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4936
             /* If any of stdin/stdout/stderr is configured as "socket", exactly one socket fd must have
              * been passed in; it becomes socket_fd. On that path 'fds' stays NULL and both fd counters
              * stay 0, i.e. no fd array is handed to exec_child(). Otherwise there is no socket_fd and
              * the complete params->fds array with its two counters is passed on. */
4937         if (context->std_input == EXEC_INPUT_SOCKET ||
4938             context->std_output == EXEC_OUTPUT_SOCKET ||
4939             context->std_error == EXEC_OUTPUT_SOCKET) {
4940
4941                 if (params->n_socket_fds > 1)
4942                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4943
4944                 if (params->n_socket_fds == 0)
4945                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4946
4947                 socket_fd = params->fds[0];
4948         } else {
4949                 socket_fd = -1;
4950                 fds = params->fds;
4951                 n_socket_fds = params->n_socket_fds;
4952                 n_storage_fds = params->n_storage_fds;
4953         }
4954
             /* named_iofds[] was initialized to { -1, -1, -1 } above; presumably this resolves the stdio
              * fds that are referenced by name - confirm in exec_context_named_iofds(). */
4955         r = exec_context_named_iofds(context, params, named_iofds);
4956         if (r < 0)
4957                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4958
4959         r = exec_context_load_environment(unit, context, &files_env);
4960         if (r < 0)
4961                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4962
             /* 'line' is only used for the debug message below; it is built from the unmodified
              * command->argv. */
4963         line = quote_command_line(command->argv);
4964         if (!line)
4965                 return log_oom();
4966
4967         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4968            and, until the next SELinux policy changes, we save further reloads in future children. */
4969         mac_selinux_maybe_reload();
4970
4971         log_unit_struct(unit, LOG_DEBUG,
4972                         LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4973                         "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4974                                                            the mount namespace in the child, but we want to log
4975                                                            from the parent, so we need to use the (possibly
4976                                                            inaccurate) path here. */
4977                         LOG_UNIT_INVOCATION_ID(unit));
4978
             /* Everything cgroup related is skipped when params->cgroup_path is unset; subcgroup_path
              * then stays NULL and the cg_attach() further down is skipped as well. */
4979         if (params->cgroup_path) {
4980                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4981                 if (r < 0)
4982                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4983                 if (r > 0) { /* We are using a child cgroup */
4984                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4985                         if (r < 0)
4986                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4987
4988                         /* Normally we would not propagate the oomd xattrs to children but since we created this
4989                          * sub-cgroup internally we should do it. */
4990                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
4991                 }
4992         }
4993
4994         pid = fork();
             /* NOTE(review): a sub-cgroup created above is not removed on this error path - presumably
              * it is cleaned up elsewhere; confirm. */
4995         if (pid < 0)
4996                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4997
4998         if (pid == 0) {
                     /* Child. exit_status is only modified by exec_child() through the pointer passed as
                      * last argument; if exec_child() never touches it the child exits with EXIT_SUCCESS. */
4999                 int exit_status = EXIT_SUCCESS;
5000
5001                 r = exec_child(unit,
5002                                command,
5003                                context,
5004                                params,
5005                                runtime,
5006                                dcreds,
5007                                socket_fd,
5008                                named_iofds,
5009                                fds,
5010                                n_socket_fds,
5011                                n_storage_fds,
5012                                files_env,
5013                                unit->manager->user_lookup_fds[1],
5014                                &exit_status);
5015
                     /* We only get here if exec_child() returned instead of replacing the process image.
                      * r < 0 is reported below, with exit_status translated into a step name for the
                      * message; r >= 0 exits without logging an error here. */
5016                 if (r < 0) {
5017                         const char *status =
5018                                 exit_status_to_string(exit_status,
5019                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5020
5021                         log_unit_struct_errno(unit, LOG_ERR, r,
5022                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5023                                               LOG_UNIT_INVOCATION_ID(unit),
5024                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5025                                                                status, command->path),
5026                                               "EXECUTABLE=%s", command->path);
5027                 }
5028
                     /* The child never falls through into the parent-only code below. */
5029                 _exit(exit_status);
5030         }
5031
             /* Only the parent reaches this point. */
5032         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5033
5034         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5035          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5036          * process will be killed too). */
5037         if (subcgroup_path)
                     /* The result is explicitly discarded: a failure to attach does not fail the spawn. */
5038                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5039
5040         exec_status_start(&command->exec_status, pid);
5041
5042         *ret = pid;
5043         return 0;
5044 }
5045
5046 void exec_context_init(ExecContext *c) {
             /* Assigns the defaults listed below to an ExecContext. Only the fields named here are
              * written; nothing is cleared first, so every other field keeps whatever the caller put
              * there (presumably callers hand in zero-initialized memory - confirm). */
5047         assert(c);
5048
5049         c->umask = 0022;
             /* Best-effort I/O scheduling class, with 0 as the second argument of ioprio_prio_value(). */
5050         c->ioprio = ioprio_prio_value(IOPRIO_CLASS_BE, 0);
5051         c->cpu_sched_policy = SCHED_OTHER;
5052         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5053         c->syslog_level_prefix = true;
5054         c->ignore_sigpipe = true;
5055         c->timer_slack_nsec = NSEC_INFINITY;
5056         c->personality = PERSONALITY_INVALID;
             /* Every directory type starts out with mode 0755. */
5057         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5058                 c->directories[t].mode = 0755;
5059         c->timeout_clean_usec = USEC_INFINITY;
5060         c->capability_bounding_set = CAP_ALL;
             /* Compile-time check that the initial value differs from NAMESPACE_FLAGS_ALL; presumably
              * this keeps "never configured" distinguishable from an explicit "all" - confirm. */
5061         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5062         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
             /* -1 is also the value exec_context_done() resets this field to. */
5063         c->log_level_max = -1;
             /* syscall_errno is only assigned in builds with HAVE_SECCOMP. */
5064 #if HAVE_SECCOMP
5065         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5066 #endif
5067         numa_policy_reset(&c->numa_policy);
5068 }
5069
5070 void exec_context_done(ExecContext *c) {
5071         assert(c);
5072
5073         c->environment = strv_free(c->environment);
5074         c->environment_files = strv_free(c->environment_files);
5075         c->pass_environment = strv_free(c->pass_environment);
5076         c->unset_environment = strv_free(c->unset_environment);
5077
5078         rlimit_free_all(c->rlimit);
5079
5080         for (size_t l = 0; l < 3; l++) {
5081                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5082                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5083         }
5084
5085         c->working_directory = mfree(c->working_directory);
5086         c->root_directory = mfree(c->root_directory);
5087         c->root_image = mfree(c->root_image);
5088         c->root_image_options = mount_options_free_all(c->root_image_options);
5089         c->root_hash = mfree(c->root_hash);
5090         c->root_hash_size = 0;
5091         c->root_hash_path = mfree(c->root_hash_path);
5092         c->root_hash_sig = mfree(c->root_hash_sig);
5093         c->root_hash_sig_size = 0;
5094         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5095         c->root_verity = mfree(c->root_verity);
5096         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5097         c->tty_path = mfree(c->tty_path);
5098         c->syslog_identifier = mfree(c->syslog_identifier);
5099         c->user = mfree(c->user);
5100         c->group = mfree(c->group);
5101
5102         c->supplementary_groups = strv_free(c->supplementary_groups);
5103
5104         c->pam_name = mfree(c->pam_name);
5105
5106         c->read_only_paths = strv_free(c->read_only_paths);
5107         c->read_write_paths = strv_free(c->read_write_paths);
5108         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5109         c->exec_paths = strv_free(c->exec_paths);
5110         c->no_exec_paths = strv_free(c->no_exec_paths);
5111         c->exec_search_path = strv_free(c->exec_search_path);
5112
5113         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5114         c->bind_mounts = NULL;
5115         c->n_bind_mounts = 0;
5116         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5117         c->temporary_filesystems = NULL;
5118         c->n_temporary_filesystems = 0;
5119         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5120
5121         cpu_set_reset(&c->cpu_set);
5122         numa_policy_reset(&c->numa_policy);
5123
5124         c->utmp_id = mfree(c->utmp_id);
5125         c->selinux_context = mfree(c->selinux_context);
5126         c->apparmor_profile = mfree(c->apparmor_profile);
5127         c->smack_process_label = mfree(c->smack_process_label);
5128
5129         c->restrict_filesystems = set_free(c->restrict_filesystems);
5130
5131         c->syscall_filter = hashmap_free(c->syscall_filter);
5132         c->syscall_archs = set_free(c->syscall_archs);
5133         c->address_families = set_free(c->address_families);
5134
5135         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5136                 exec_directory_done(&c->directories[t]);
5137
5138         c->log_level_max = -1;
5139
5140         exec_context_free_log_extra_fields(c);
5141
5142         c->log_ratelimit_interval_usec = 0;
5143         c->log_ratelimit_burst = 0;
5144
5145         c->stdin_data = mfree(c->stdin_data);
5146         c->stdin_data_size = 0;
5147
5148         c->network_namespace_path = mfree(c->network_namespace_path);
5149         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5150
5151         c->log_namespace = mfree(c->log_namespace);
5152
5153         c->load_credentials = hashmap_free(c->load_credentials);
5154         c->set_credentials = hashmap_free(c->set_credentials);
5155 }
5156
5157 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5158         assert(c);
5159
5160         if (!runtime_prefix)
5161                 return 0;
5162
5163         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5164                 _cleanup_free_ char *p = NULL;
5165
5166                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5167                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5168                 else
5169                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5170                 if (!p)
5171                         return -ENOMEM;
5172
5173                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5174                  * service next. */
5175                 (void) rm_rf(p, REMOVE_ROOT);
5176
5177                 char **symlink;
5178                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5179                         _cleanup_free_ char *symlink_abs = NULL;
5180
5181                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5182                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5183                         else
5184                                 symlink_abs = path_join(runtime_prefix, *symlink);
5185                         if (!symlink_abs)
5186                                 return -ENOMEM;
5187
5188                         (void) unlink(symlink_abs);
5189                 }
5190
5191         }
5192
5193         return 0;
5194 }
5195
5196 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5197         _cleanup_free_ char *p = NULL;
5198
5199         assert(c);
5200
5201         if (!runtime_prefix || !unit)
5202                 return 0;
5203
5204         p = path_join(runtime_prefix, "credentials", unit);
5205         if (!p)
5206                 return -ENOMEM;
5207
5208         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5209          * unmount it, and afterwards remove the mount point */
5210         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5211         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5212
5213         return 0;
5214 }
5215
5216 static void exec_command_done(ExecCommand *c) {
5217         assert(c);
5218
5219         c->path = mfree(c->path);
5220         c->argv = strv_free(c->argv);
5221 }
5222
5223 void exec_command_done_array(ExecCommand *c, size_t n) {
5224         for (size_t i = 0; i < n; i++)
5225                 exec_command_done(c+i);
5226 }
5227
5228 ExecCommand* exec_command_free_list(ExecCommand *c) {
5229         ExecCommand *i;
5230
5231         while ((i = c)) {
5232                 LIST_REMOVE(command, c, i);
5233                 exec_command_done(i);
5234                 free(i);
5235         }
5236
5237         return NULL;
5238 }
5239
5240 void exec_command_free_array(ExecCommand **c, size_t n) {
5241         for (size_t i = 0; i < n; i++)
5242                 c[i] = exec_command_free_list(c[i]);
5243 }
5244
5245 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5246         for (size_t i = 0; i < n; i++)
5247                 exec_status_reset(&c[i].exec_status);
5248 }
5249
5250 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5251         for (size_t i = 0; i < n; i++) {
5252                 ExecCommand *z;
5253
5254                 LIST_FOREACH(command, z, c[i])
5255                         exec_status_reset(&z->exec_status);
5256         }
5257 }
5258
5259 typedef struct InvalidEnvInfo {
5260         const Unit *unit;
5261         const char *path;
5262 } InvalidEnvInfo;
5263
5264 static void invalid_env(const char *p, void *userdata) {
5265         InvalidEnvInfo *info = userdata;
5266
5267         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5268 }
5269
5270 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5271         assert(c);
5272
5273         switch (fd_index) {
5274
5275         case STDIN_FILENO:
5276                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5277                         return NULL;
5278
5279                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5280
5281         case STDOUT_FILENO:
5282                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5283                         return NULL;
5284
5285                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5286
5287         case STDERR_FILENO:
5288                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5289                         return NULL;
5290
5291                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5292
5293         default:
5294                 return NULL;
5295         }
5296 }
5297
5298 static int exec_context_named_iofds(
5299                 const ExecContext *c,
5300                 const ExecParameters *p,
5301                 int named_iofds[static 3]) {
5302
5303         size_t targets;
5304         const char* stdio_fdname[3];
5305         size_t n_fds;
5306
5307         assert(c);
5308         assert(p);
5309         assert(named_iofds);
5310
5311         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5312                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5313                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5314
5315         for (size_t i = 0; i < 3; i++)
5316                 stdio_fdname[i] = exec_context_fdname(c, i);
5317
5318         n_fds = p->n_storage_fds + p->n_socket_fds;
5319
5320         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5321                 if (named_iofds[STDIN_FILENO] < 0 &&
5322                     c->std_input == EXEC_INPUT_NAMED_FD &&
5323                     stdio_fdname[STDIN_FILENO] &&
5324                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5325
5326                         named_iofds[STDIN_FILENO] = p->fds[i];
5327                         targets--;
5328
5329                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5330                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5331                            stdio_fdname[STDOUT_FILENO] &&
5332                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5333
5334                         named_iofds[STDOUT_FILENO] = p->fds[i];
5335                         targets--;
5336
5337                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5338                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5339                            stdio_fdname[STDERR_FILENO] &&
5340                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5341
5342                         named_iofds[STDERR_FILENO] = p->fds[i];
5343                         targets--;
5344                 }
5345
5346         return targets == 0 ? 0 : -ENOENT;
5347 }
5348
5349 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5350         char **i, **r = NULL;
5351
5352         assert(c);
5353         assert(l);
5354
5355         STRV_FOREACH(i, c->environment_files) {
5356                 char *fn;
5357                 int k;
5358                 bool ignore = false;
5359                 char **p;
5360                 _cleanup_globfree_ glob_t pglob = {};
5361
5362                 fn = *i;
5363
5364                 if (fn[0] == '-') {
5365                         ignore = true;
5366                         fn++;
5367                 }
5368
5369                 if (!path_is_absolute(fn)) {
5370                         if (ignore)
5371                                 continue;
5372
5373                         strv_free(r);
5374                         return -EINVAL;
5375                 }
5376
5377                 /* Filename supports globbing, take all matching files */
5378                 k = safe_glob(fn, 0, &pglob);
5379                 if (k < 0) {
5380                         if (ignore)
5381                                 continue;
5382
5383                         strv_free(r);
5384                         return k;
5385                 }
5386
5387                 /* When we don't match anything, -ENOENT should be returned */
5388                 assert(pglob.gl_pathc > 0);
5389
5390                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5391                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5392                         if (k < 0) {
5393                                 if (ignore)
5394                                         continue;
5395
5396                                 strv_free(r);
5397                                 return k;
5398                         }
5399                         /* Log invalid environment variables with filename */
5400                         if (p) {
5401                                 InvalidEnvInfo info = {
5402                                         .unit = unit,
5403                                         .path = pglob.gl_pathv[n]
5404                                 };
5405
5406                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5407                         }
5408
5409                         if (!r)
5410                                 r = p;
5411                         else {
5412                                 char **m;
5413
5414                                 m = strv_env_merge(r, p);
5415                                 strv_free(r);
5416                                 strv_free(p);
5417                                 if (!m)
5418                                         return -ENOMEM;
5419
5420                                 r = m;
5421                         }
5422                 }
5423         }
5424
5425         *l = r;
5426
5427         return 0;
5428 }
5429
5430 static bool tty_may_match_dev_console(const char *tty) {
5431         _cleanup_free_ char *resolved = NULL;
5432
5433         if (!tty)
5434                 return true;
5435
5436         tty = skip_dev_prefix(tty);
5437
5438         /* trivial identity? */
5439         if (streq(tty, "console"))
5440                 return true;
5441
5442         if (resolve_dev_console(&resolved) < 0)
5443                 return true; /* if we could not resolve, assume it may */
5444
5445         /* "tty0" means the active VC, so it may be the same sometimes */
5446         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5447 }
5448
5449 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5450         assert(ec);
5451
5452         return ec->tty_reset ||
5453                 ec->tty_vhangup ||
5454                 ec->tty_vt_disallocate ||
5455                 is_terminal_input(ec->std_input) ||
5456                 is_terminal_output(ec->std_output) ||
5457                 is_terminal_output(ec->std_error);
5458 }
5459
5460 bool exec_context_may_touch_console(const ExecContext *ec) {
5461
5462         return exec_context_may_touch_tty(ec) &&
5463                tty_may_match_dev_console(exec_context_tty_path(ec));
5464 }
5465
5466 static void strv_fprintf(FILE *f, char **l) {
5467         char **g;
5468
5469         assert(f);
5470
5471         STRV_FOREACH(g, l)
5472                 fprintf(f, " %s", *g);
5473 }
5474
/* Writes a single "<prefix><name>: <item> <item> ..." line for the string list to f. Writes nothing if
 * the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5486
5487 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5488         char **e, **d;
5489         int r;
5490
5491         assert(c);
5492         assert(f);
5493
5494         prefix = strempty(prefix);
5495
5496         fprintf(f,
5497                 "%sUMask: %04o\n"
5498                 "%sWorkingDirectory: %s\n"
5499                 "%sRootDirectory: %s\n"
5500                 "%sNonBlocking: %s\n"
5501                 "%sPrivateTmp: %s\n"
5502                 "%sPrivateDevices: %s\n"
5503                 "%sProtectKernelTunables: %s\n"
5504                 "%sProtectKernelModules: %s\n"
5505                 "%sProtectKernelLogs: %s\n"
5506                 "%sProtectClock: %s\n"
5507                 "%sProtectControlGroups: %s\n"
5508                 "%sPrivateNetwork: %s\n"
5509                 "%sPrivateUsers: %s\n"
5510                 "%sProtectHome: %s\n"
5511                 "%sProtectSystem: %s\n"
5512                 "%sMountAPIVFS: %s\n"
5513                 "%sIgnoreSIGPIPE: %s\n"
5514                 "%sMemoryDenyWriteExecute: %s\n"
5515                 "%sRestrictRealtime: %s\n"
5516                 "%sRestrictSUIDSGID: %s\n"
5517                 "%sKeyringMode: %s\n"
5518                 "%sProtectHostname: %s\n"
5519                 "%sProtectProc: %s\n"
5520                 "%sProcSubset: %s\n",
5521                 prefix, c->umask,
5522                 prefix, empty_to_root(c->working_directory),
5523                 prefix, empty_to_root(c->root_directory),
5524                 prefix, yes_no(c->non_blocking),
5525                 prefix, yes_no(c->private_tmp),
5526                 prefix, yes_no(c->private_devices),
5527                 prefix, yes_no(c->protect_kernel_tunables),
5528                 prefix, yes_no(c->protect_kernel_modules),
5529                 prefix, yes_no(c->protect_kernel_logs),
5530                 prefix, yes_no(c->protect_clock),
5531                 prefix, yes_no(c->protect_control_groups),
5532                 prefix, yes_no(c->private_network),
5533                 prefix, yes_no(c->private_users),
5534                 prefix, protect_home_to_string(c->protect_home),
5535                 prefix, protect_system_to_string(c->protect_system),
5536                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5537                 prefix, yes_no(c->ignore_sigpipe),
5538                 prefix, yes_no(c->memory_deny_write_execute),
5539                 prefix, yes_no(c->restrict_realtime),
5540                 prefix, yes_no(c->restrict_suid_sgid),
5541                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5542                 prefix, yes_no(c->protect_hostname),
5543                 prefix, protect_proc_to_string(c->protect_proc),
5544                 prefix, proc_subset_to_string(c->proc_subset));
5545
5546         if (c->root_image)
5547                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5548
5549         if (c->root_image_options) {
5550                 MountOptions *o;
5551
5552                 fprintf(f, "%sRootImageOptions:", prefix);
5553                 LIST_FOREACH(mount_options, o, c->root_image_options)
5554                         if (!isempty(o->options))
5555                                 fprintf(f, " %s:%s",
5556                                         partition_designator_to_string(o->partition_designator),
5557                                         o->options);
5558                 fprintf(f, "\n");
5559         }
5560
5561         if (c->root_hash) {
5562                 _cleanup_free_ char *encoded = NULL;
5563                 encoded = hexmem(c->root_hash, c->root_hash_size);
5564                 if (encoded)
5565                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5566         }
5567
5568         if (c->root_hash_path)
5569                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5570
5571         if (c->root_hash_sig) {
5572                 _cleanup_free_ char *encoded = NULL;
5573                 ssize_t len;
5574                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5575                 if (len)
5576                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5577         }
5578
5579         if (c->root_hash_sig_path)
5580                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5581
5582         if (c->root_verity)
5583                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5584
5585         STRV_FOREACH(e, c->environment)
5586                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5587
5588         STRV_FOREACH(e, c->environment_files)
5589                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5590
5591         STRV_FOREACH(e, c->pass_environment)
5592                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5593
5594         STRV_FOREACH(e, c->unset_environment)
5595                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5596
5597         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5598
5599         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5600                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5601
5602                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5603                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5604
5605                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5606                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5607                 }
5608         }
5609
5610         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5611
5612         if (c->nice_set)
5613                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5614
5615         if (c->oom_score_adjust_set)
5616                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5617
5618         if (c->coredump_filter_set)
5619                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5620
5621         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5622                 if (c->rlimit[i]) {
5623                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5624                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5625                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5626                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5627                 }
5628
5629         if (c->ioprio_set) {
5630                 _cleanup_free_ char *class_str = NULL;
5631
5632                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5633                 if (r >= 0)
5634                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5635
5636                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5637         }
5638
5639         if (c->cpu_sched_set) {
5640                 _cleanup_free_ char *policy_str = NULL;
5641
5642                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5643                 if (r >= 0)
5644                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5645
5646                 fprintf(f,
5647                         "%sCPUSchedulingPriority: %i\n"
5648                         "%sCPUSchedulingResetOnFork: %s\n",
5649                         prefix, c->cpu_sched_priority,
5650                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5651         }
5652
5653         if (c->cpu_set.set) {
5654                 _cleanup_free_ char *affinity = NULL;
5655
5656                 affinity = cpu_set_to_range_string(&c->cpu_set);
5657                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5658         }
5659
5660         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5661                 _cleanup_free_ char *nodes = NULL;
5662
5663                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5664                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5665                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5666         }
5667
5668         if (c->timer_slack_nsec != NSEC_INFINITY)
5669                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5670
5671         fprintf(f,
5672                 "%sStandardInput: %s\n"
5673                 "%sStandardOutput: %s\n"
5674                 "%sStandardError: %s\n",
5675                 prefix, exec_input_to_string(c->std_input),
5676                 prefix, exec_output_to_string(c->std_output),
5677                 prefix, exec_output_to_string(c->std_error));
5678
5679         if (c->std_input == EXEC_INPUT_NAMED_FD)
5680                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5681         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5682                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5683         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5684                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5685
5686         if (c->std_input == EXEC_INPUT_FILE)
5687                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5688         if (c->std_output == EXEC_OUTPUT_FILE)
5689                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5690         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5691                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5692         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5693                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5694         if (c->std_error == EXEC_OUTPUT_FILE)
5695                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5696         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5697                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5698         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5699                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5700
5701         if (c->tty_path)
5702                 fprintf(f,
5703                         "%sTTYPath: %s\n"
5704                         "%sTTYReset: %s\n"
5705                         "%sTTYVHangup: %s\n"
5706                         "%sTTYVTDisallocate: %s\n",
5707                         prefix, c->tty_path,
5708                         prefix, yes_no(c->tty_reset),
5709                         prefix, yes_no(c->tty_vhangup),
5710                         prefix, yes_no(c->tty_vt_disallocate));
5711
5712         if (IN_SET(c->std_output,
5713                    EXEC_OUTPUT_KMSG,
5714                    EXEC_OUTPUT_JOURNAL,
5715                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5716                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5717             IN_SET(c->std_error,
5718                    EXEC_OUTPUT_KMSG,
5719                    EXEC_OUTPUT_JOURNAL,
5720                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5721                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5722
5723                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5724
5725                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5726                 if (r >= 0)
5727                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5728
5729                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5730                 if (r >= 0)
5731                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5732         }
5733
5734         if (c->log_level_max >= 0) {
5735                 _cleanup_free_ char *t = NULL;
5736
5737                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5738
5739                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5740         }
5741
5742         if (c->log_ratelimit_interval_usec > 0)
5743                 fprintf(f,
5744                         "%sLogRateLimitIntervalSec: %s\n",
5745                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5746
5747         if (c->log_ratelimit_burst > 0)
5748                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5749
5750         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5751                 fprintf(f, "%sLogExtraFields: ", prefix);
5752                 fwrite(c->log_extra_fields[j].iov_base,
5753                        1, c->log_extra_fields[j].iov_len,
5754                        f);
5755                 fputc('\n', f);
5756         }
5757
5758         if (c->log_namespace)
5759                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5760
5761         if (c->secure_bits) {
5762                 _cleanup_free_ char *str = NULL;
5763
5764                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5765                 if (r >= 0)
5766                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5767         }
5768
5769         if (c->capability_bounding_set != CAP_ALL) {
5770                 _cleanup_free_ char *str = NULL;
5771
5772                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5773                 if (r >= 0)
5774                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5775         }
5776
5777         if (c->capability_ambient_set != 0) {
5778                 _cleanup_free_ char *str = NULL;
5779
5780                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5781                 if (r >= 0)
5782                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5783         }
5784
5785         if (c->user)
5786                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5787         if (c->group)
5788                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5789
5790         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5791
5792         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5793
5794         if (c->pam_name)
5795                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5796
5797         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5798         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5799         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5800         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5801         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5802         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
5803
5804         for (size_t i = 0; i < c->n_bind_mounts; i++)
5805                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5806                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5807                         c->bind_mounts[i].ignore_enoent ? "-": "",
5808                         c->bind_mounts[i].source,
5809                         c->bind_mounts[i].destination,
5810                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5811
5812         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5813                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5814
5815                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5816                         t->path,
5817                         isempty(t->options) ? "" : ":",
5818                         strempty(t->options));
5819         }
5820
5821         if (c->utmp_id)
5822                 fprintf(f,
5823                         "%sUtmpIdentifier: %s\n",
5824                         prefix, c->utmp_id);
5825
5826         if (c->selinux_context)
5827                 fprintf(f,
5828                         "%sSELinuxContext: %s%s\n",
5829                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5830
5831         if (c->apparmor_profile)
5832                 fprintf(f,
5833                         "%sAppArmorProfile: %s%s\n",
5834                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5835
5836         if (c->smack_process_label)
5837                 fprintf(f,
5838                         "%sSmackProcessLabel: %s%s\n",
5839                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5840
5841         if (c->personality != PERSONALITY_INVALID)
5842                 fprintf(f,
5843                         "%sPersonality: %s\n",
5844                         prefix, strna(personality_to_string(c->personality)));
5845
5846         fprintf(f,
5847                 "%sLockPersonality: %s\n",
5848                 prefix, yes_no(c->lock_personality));
5849
5850         if (c->syscall_filter) {
5851 #if HAVE_SECCOMP
5852                 void *id, *val;
5853                 bool first = true;
5854 #endif
5855
5856                 fprintf(f,
5857                         "%sSystemCallFilter: ",
5858                         prefix);
5859
5860                 if (!c->syscall_allow_list)
5861                         fputc('~', f);
5862
5863 #if HAVE_SECCOMP
5864                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5865                         _cleanup_free_ char *name = NULL;
5866                         const char *errno_name = NULL;
5867                         int num = PTR_TO_INT(val);
5868
5869                         if (first)
5870                                 first = false;
5871                         else
5872                                 fputc(' ', f);
5873
5874                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5875                         fputs(strna(name), f);
5876
5877                         if (num >= 0) {
5878                                 errno_name = seccomp_errno_or_action_to_string(num);
5879                                 if (errno_name)
5880                                         fprintf(f, ":%s", errno_name);
5881                                 else
5882                                         fprintf(f, ":%d", num);
5883                         }
5884                 }
5885 #endif
5886
5887                 fputc('\n', f);
5888         }
5889
5890         if (c->syscall_archs) {
5891 #if HAVE_SECCOMP
5892                 void *id;
5893 #endif
5894
5895                 fprintf(f,
5896                         "%sSystemCallArchitectures:",
5897                         prefix);
5898
5899 #if HAVE_SECCOMP
5900                 SET_FOREACH(id, c->syscall_archs)
5901                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5902 #endif
5903                 fputc('\n', f);
5904         }
5905
5906         if (exec_context_restrict_namespaces_set(c)) {
5907                 _cleanup_free_ char *s = NULL;
5908
5909                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5910                 if (r >= 0)
5911                         fprintf(f, "%sRestrictNamespaces: %s\n",
5912                                 prefix, strna(s));
5913         }
5914
5915 #if HAVE_LIBBPF
5916         if (exec_context_restrict_filesystems_set(c))
5917                 SET_FOREACH(e, c->restrict_filesystems)
5918                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
5919 #endif
5920
5921         if (c->network_namespace_path)
5922                 fprintf(f,
5923                         "%sNetworkNamespacePath: %s\n",
5924                         prefix, c->network_namespace_path);
5925
5926         if (c->syscall_errno > 0) {
5927 #if HAVE_SECCOMP
5928                 const char *errno_name;
5929 #endif
5930
5931                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5932
5933 #if HAVE_SECCOMP
5934                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5935                 if (errno_name)
5936                         fputs(errno_name, f);
5937                 else
5938                         fprintf(f, "%d", c->syscall_errno);
5939 #endif
5940                 fputc('\n', f);
5941         }
5942
5943         for (size_t i = 0; i < c->n_mount_images; i++) {
5944                 MountOptions *o;
5945
5946                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
5947                         c->mount_images[i].ignore_enoent ? "-": "",
5948                         c->mount_images[i].source,
5949                         c->mount_images[i].destination);
5950                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5951                         fprintf(f, ":%s:%s",
5952                                 partition_designator_to_string(o->partition_designator),
5953                                 strempty(o->options));
5954                 fprintf(f, "\n");
5955         }
5956
5957         for (size_t i = 0; i < c->n_extension_images; i++) {
5958                 MountOptions *o;
5959
5960                 fprintf(f, "%sExtensionImages: %s%s", prefix,
5961                         c->extension_images[i].ignore_enoent ? "-": "",
5962                         c->extension_images[i].source);
5963                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5964                         fprintf(f, ":%s:%s",
5965                                 partition_designator_to_string(o->partition_designator),
5966                                 strempty(o->options));
5967                 fprintf(f, "\n");
5968         }
5969 }
5970
5971 bool exec_context_maintains_privileges(const ExecContext *c) {
5972         assert(c);
5973
5974         /* Returns true if the process forked off would run under
5975          * an unchanged UID or as root. */
5976
5977         if (!c->user)
5978                 return true;
5979
5980         if (streq(c->user, "root") || streq(c->user, "0"))
5981                 return true;
5982
5983         return false;
5984 }
5985
5986 int exec_context_get_effective_ioprio(const ExecContext *c) {
5987         int p;
5988
5989         assert(c);
5990
5991         if (c->ioprio_set)
5992                 return c->ioprio;
5993
5994         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5995         if (p < 0)
5996                 return ioprio_prio_value(IOPRIO_CLASS_BE, 4);
5997
5998         return p;
5999 }
6000
6001 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6002         assert(c);
6003
6004         /* Explicit setting wins */
6005         if (c->mount_apivfs_set)
6006                 return c->mount_apivfs;
6007
6008         /* Default to "yes" if root directory or image are specified */
6009         if (exec_context_with_rootfs(c))
6010                 return true;
6011
6012         return false;
6013 }
6014
6015 void exec_context_free_log_extra_fields(ExecContext *c) {
6016         assert(c);
6017
6018         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6019                 free(c->log_extra_fields[l].iov_base);
6020         c->log_extra_fields = mfree(c->log_extra_fields);
6021         c->n_log_extra_fields = 0;
6022 }
6023
6024 void exec_context_revert_tty(ExecContext *c) {
6025         _cleanup_close_ int fd = -1;
6026         const char *path;
6027         struct stat st;
6028         int r;
6029
6030         assert(c);
6031
6032         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6033         exec_context_tty_reset(c, NULL);
6034
6035         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6036          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6037          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6038         if (!exec_context_may_touch_tty(c))
6039                 return;
6040
6041         path = exec_context_tty_path(c);
6042         if (!path)
6043                 return;
6044
6045         fd = open(path, O_PATH|O_CLOEXEC);
6046         if (fd < 0)
6047                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6048                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6049                                              path);
6050
6051         if (fstat(fd, &st) < 0)
6052                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6053
6054         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6055          * if things are a character device, since a proper check either means we'd have to open the TTY and
6056          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6057          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6058          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6059         if (!S_ISCHR(st.st_mode))
6060                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6061
6062         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6063         if (r < 0)
6064                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6065 }
6066
6067 int exec_context_get_clean_directories(
6068                 ExecContext *c,
6069                 char **prefix,
6070                 ExecCleanMask mask,
6071                 char ***ret) {
6072
6073         _cleanup_strv_free_ char **l = NULL;
6074         int r;
6075
6076         assert(c);
6077         assert(prefix);
6078         assert(ret);
6079
6080         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6081                 if (!FLAGS_SET(mask, 1U << t))
6082                         continue;
6083
6084                 if (!prefix[t])
6085                         continue;
6086
6087                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6088                         char *j;
6089
6090                         j = path_join(prefix[t], c->directories[t].items[i].path);
6091                         if (!j)
6092                                 return -ENOMEM;
6093
6094                         r = strv_consume(&l, j);
6095                         if (r < 0)
6096                                 return r;
6097
6098                         /* Also remove private directories unconditionally. */
6099                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6100                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6101                                 if (!j)
6102                                         return -ENOMEM;
6103
6104                                 r = strv_consume(&l, j);
6105                                 if (r < 0)
6106                                         return r;
6107                         }
6108
6109                         char **symlink;
6110                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6111                                 j = path_join(prefix[t], *symlink);
6112                                 if (!j)
6113                                         return -ENOMEM;
6114
6115                                 r = strv_consume(&l, j);
6116                                 if (r < 0)
6117                                         return r;
6118                         }
6119                 }
6120         }
6121
6122         *ret = TAKE_PTR(l);
6123         return 0;
6124 }
6125
6126 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6127         ExecCleanMask mask = 0;
6128
6129         assert(c);
6130         assert(ret);
6131
6132         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6133                 if (c->directories[t].n_items > 0)
6134                         mask |= 1U << t;
6135
6136         *ret = mask;
6137         return 0;
6138 }
6139
6140 void exec_status_start(ExecStatus *s, pid_t pid) {
6141         assert(s);
6142
6143         *s = (ExecStatus) {
6144                 .pid = pid,
6145         };
6146
6147         dual_timestamp_get(&s->start_timestamp);
6148 }
6149
6150 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6151         assert(s);
6152
6153         if (s->pid != pid)
6154                 *s = (ExecStatus) {
6155                         .pid = pid,
6156                 };
6157
6158         dual_timestamp_get(&s->exit_timestamp);
6159
6160         s->code = code;
6161         s->status = status;
6162
6163         if (context && context->utmp_id)
6164                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6165 }
6166
6167 void exec_status_reset(ExecStatus *s) {
6168         assert(s);
6169
6170         *s = (ExecStatus) {};
6171 }
6172
6173 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6174         assert(s);
6175         assert(f);
6176
6177         if (s->pid <= 0)
6178                 return;
6179
6180         prefix = strempty(prefix);
6181
6182         fprintf(f,
6183                 "%sPID: "PID_FMT"\n",
6184                 prefix, s->pid);
6185
6186         if (dual_timestamp_is_set(&s->start_timestamp))
6187                 fprintf(f,
6188                         "%sStart Timestamp: %s\n",
6189                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6190
6191         if (dual_timestamp_is_set(&s->exit_timestamp))
6192                 fprintf(f,
6193                         "%sExit Timestamp: %s\n"
6194                         "%sExit Code: %s\n"
6195                         "%sExit Status: %i\n",
6196                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6197                         prefix, sigchld_code_to_string(s->code),
6198                         prefix, s->status);
6199 }
6200
6201 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6202         _cleanup_free_ char *cmd = NULL;
6203         const char *prefix2;
6204
6205         assert(c);
6206         assert(f);
6207
6208         prefix = strempty(prefix);
6209         prefix2 = strjoina(prefix, "\t");
6210
6211         cmd = quote_command_line(c->argv);
6212         fprintf(f,
6213                 "%sCommand Line: %s\n",
6214                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
6215
6216         exec_status_dump(&c->exec_status, f, prefix2);
6217 }
6218
6219 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6220         assert(f);
6221
6222         prefix = strempty(prefix);
6223
6224         LIST_FOREACH(command, c, c)
6225                 exec_command_dump(c, f, prefix);
6226 }
6227
6228 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6229         ExecCommand *end;
6230
6231         assert(l);
6232         assert(e);
6233
6234         if (*l) {
6235                 /* It's kind of important, that we keep the order here */
6236                 LIST_FIND_TAIL(command, *l, end);
6237                 LIST_INSERT_AFTER(command, *l, end, e);
6238         } else
6239               *l = e;
6240 }
6241
6242 int exec_command_set(ExecCommand *c, const char *path, ...) {
6243         va_list ap;
6244         char **l, *p;
6245
6246         assert(c);
6247         assert(path);
6248
6249         va_start(ap, path);
6250         l = strv_new_ap(path, ap);
6251         va_end(ap);
6252
6253         if (!l)
6254                 return -ENOMEM;
6255
6256         p = strdup(path);
6257         if (!p) {
6258                 strv_free(l);
6259                 return -ENOMEM;
6260         }
6261
6262         free_and_replace(c->path, p);
6263
6264         return strv_free_and_replace(c->argv, l);
6265 }
6266
6267 int exec_command_append(ExecCommand *c, const char *path, ...) {
6268         _cleanup_strv_free_ char **l = NULL;
6269         va_list ap;
6270         int r;
6271
6272         assert(c);
6273         assert(path);
6274
6275         va_start(ap, path);
6276         l = strv_new_ap(path, ap);
6277         va_end(ap);
6278
6279         if (!l)
6280                 return -ENOMEM;
6281
6282         r = strv_extend_strv(&c->argv, l, false);
6283         if (r < 0)
6284                 return r;
6285
6286         return 0;
6287 }
6288
6289 static void *remove_tmpdir_thread(void *p) {
6290         _cleanup_free_ char *path = p;
6291
6292         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6293         return NULL;
6294 }
6295
6296 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6297         int r;
6298
6299         if (!rt)
6300                 return NULL;
6301
6302         if (rt->manager)
6303                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6304
6305         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6306
6307         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6308                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6309
6310                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6311                 if (r < 0)
6312                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6313                 else
6314                         rt->tmp_dir = NULL;
6315         }
6316
6317         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6318                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6319
6320                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6321                 if (r < 0)
6322                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6323                 else
6324                         rt->var_tmp_dir = NULL;
6325         }
6326
6327         rt->id = mfree(rt->id);
6328         rt->tmp_dir = mfree(rt->tmp_dir);
6329         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6330         safe_close_pair(rt->netns_storage_socket);
6331         safe_close_pair(rt->ipcns_storage_socket);
6332         return mfree(rt);
6333 }
6334
6335 static void exec_runtime_freep(ExecRuntime **rt) {
6336         (void) exec_runtime_free(*rt, false);
6337 }
6338
6339 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6340         _cleanup_free_ char *id_copy = NULL;
6341         ExecRuntime *n;
6342
6343         assert(ret);
6344
6345         id_copy = strdup(id);
6346         if (!id_copy)
6347                 return -ENOMEM;
6348
6349         n = new(ExecRuntime, 1);
6350         if (!n)
6351                 return -ENOMEM;
6352
6353         *n = (ExecRuntime) {
6354                 .id = TAKE_PTR(id_copy),
6355                 .netns_storage_socket = { -1, -1 },
6356                 .ipcns_storage_socket = { -1, -1 },
6357         };
6358
6359         *ret = n;
6360         return 0;
6361 }
6362
6363 static int exec_runtime_add(
6364                 Manager *m,
6365                 const char *id,
6366                 char **tmp_dir,
6367                 char **var_tmp_dir,
6368                 int netns_storage_socket[2],
6369                 int ipcns_storage_socket[2],
6370                 ExecRuntime **ret) {
6371
6372         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6373         int r;
6374
6375         assert(m);
6376         assert(id);
6377
6378         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6379
6380         r = exec_runtime_allocate(&rt, id);
6381         if (r < 0)
6382                 return r;
6383
6384         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6385         if (r < 0)
6386                 return r;
6387
6388         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6389         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6390         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6391
6392         if (netns_storage_socket) {
6393                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6394                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6395         }
6396
6397         if (ipcns_storage_socket) {
6398                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6399                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6400         }
6401
6402         rt->manager = m;
6403
6404         if (ret)
6405                 *ret = rt;
6406         /* do not remove created ExecRuntime object when the operation succeeds. */
6407         TAKE_PTR(rt);
6408         return 0;
6409 }
6410
6411 static int exec_runtime_make(
6412                 Manager *m,
6413                 const ExecContext *c,
6414                 const char *id,
6415                 ExecRuntime **ret) {
6416
6417         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6418         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6419         int r;
6420
6421         assert(m);
6422         assert(c);
6423         assert(id);
6424
6425         /* It is not necessary to create ExecRuntime object. */
6426         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6427                 *ret = NULL;
6428                 return 0;
6429         }
6430
6431         if (c->private_tmp &&
6432             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6433               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6434                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6435                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6436                 if (r < 0)
6437                         return r;
6438         }
6439
6440         if (c->private_network || c->network_namespace_path) {
6441                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6442                         return -errno;
6443         }
6444
6445         if (c->private_ipc || c->ipc_namespace_path) {
6446                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6447                         return -errno;
6448         }
6449
6450         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6451         if (r < 0)
6452                 return r;
6453
6454         return 1;
6455 }
6456
6457 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6458         ExecRuntime *rt;
6459         int r;
6460
6461         assert(m);
6462         assert(id);
6463         assert(ret);
6464
6465         rt = hashmap_get(m->exec_runtime_by_id, id);
6466         if (rt)
6467                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6468                 goto ref;
6469
6470         if (!create) {
6471                 *ret = NULL;
6472                 return 0;
6473         }
6474
6475         /* If not found, then create a new object. */
6476         r = exec_runtime_make(m, c, id, &rt);
6477         if (r < 0)
6478                 return r;
6479         if (r == 0) {
6480                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6481                 *ret = NULL;
6482                 return 0;
6483         }
6484
6485 ref:
6486         /* increment reference counter. */
6487         rt->n_ref++;
6488         *ret = rt;
6489         return 1;
6490 }
6491
6492 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6493         if (!rt)
6494                 return NULL;
6495
6496         assert(rt->n_ref > 0);
6497
6498         rt->n_ref--;
6499         if (rt->n_ref > 0)
6500                 return NULL;
6501
6502         return exec_runtime_free(rt, destroy);
6503 }
6504
6505 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6506         ExecRuntime *rt;
6507
6508         assert(m);
6509         assert(f);
6510         assert(fds);
6511
6512         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6513                 fprintf(f, "exec-runtime=%s", rt->id);
6514
6515                 if (rt->tmp_dir)
6516                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6517
6518                 if (rt->var_tmp_dir)
6519                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6520
6521                 if (rt->netns_storage_socket[0] >= 0) {
6522                         int copy;
6523
6524                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6525                         if (copy < 0)
6526                                 return copy;
6527
6528                         fprintf(f, " netns-socket-0=%i", copy);
6529                 }
6530
6531                 if (rt->netns_storage_socket[1] >= 0) {
6532                         int copy;
6533
6534                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6535                         if (copy < 0)
6536                                 return copy;
6537
6538                         fprintf(f, " netns-socket-1=%i", copy);
6539                 }
6540
6541                 if (rt->ipcns_storage_socket[0] >= 0) {
6542                         int copy;
6543
6544                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6545                         if (copy < 0)
6546                                 return copy;
6547
6548                         fprintf(f, " ipcns-socket-0=%i", copy);
6549                 }
6550
6551                 if (rt->ipcns_storage_socket[1] >= 0) {
6552                         int copy;
6553
6554                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6555                         if (copy < 0)
6556                                 return copy;
6557
6558                         fprintf(f, " ipcns-socket-1=%i", copy);
6559                 }
6560
6561                 fputc('\n', f);
6562         }
6563
6564         return 0;
6565 }
6566
int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
        ExecRuntime *rt;
        int r;

        /* This is for the migration from old (v237 or earlier) deserialization text.
         * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
         * Even if the ExecRuntime object was originally created by another unit, we cannot tell
         * from the serialized text, hence we always create a new object owned by this unit.
         *
         * Handles a single key/value pair. Returns 1 if the pair was applied, 0 if it was ignored
         * (unknown key, unparsable or unknown fd, unit without ID, failure to register the object),
         * and a negative error if memory is short. */

        assert(u);
        assert(key);
        assert(value);

        /* Manager manages ExecRuntime objects by the unit id.
         * So, we omit the serialized text when the unit does not have id (yet?)... */
        if (isempty(u->id)) {
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
                return log_oom();

        /* Reuse the object if an earlier key of the same unit already created it. Otherwise allocate
         * a new one, which stays owned by the cleanup handler until it is registered below. */
        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
        if (!rt) {
                if (exec_runtime_allocate(&rt_create, u->id) < 0)
                        return log_oom();

                rt = rt_create;
        }

        if (streq(key, "tmp-dir")) {
                if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "var-tmp-dir")) {
                if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "netns-socket-0")) {
                int fd;

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                /* Replace whatever socket was stored before by the one taken out of the fd set */
                safe_close(rt->netns_storage_socket[0]);
                rt->netns_storage_socket[0] = fdset_remove(fds, fd);

        } else if (streq(key, "netns-socket-1")) {
                int fd;

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                safe_close(rt->netns_storage_socket[1]);
                rt->netns_storage_socket[1] = fdset_remove(fds, fd);

        } else
                /* Not one of our keys */
                return 0;

        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
        if (rt_create) {
                r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
                if (r < 0) {
                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                        return 0;
                }

                rt_create->manager = u->manager;

                /* Avoid cleanup */
                TAKE_PTR(rt_create);
        }

        return 1;
}
6648
6649 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6650         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6651         char *id = NULL;
6652         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6653         const char *p, *v = value;
6654         size_t n;
6655
6656         assert(m);
6657         assert(value);
6658         assert(fds);
6659
6660         n = strcspn(v, " ");
6661         id = strndupa_safe(v, n);
6662         if (v[n] != ' ')
6663                 goto finalize;
6664         p = v + n + 1;
6665
6666         v = startswith(p, "tmp-dir=");
6667         if (v) {
6668                 n = strcspn(v, " ");
6669                 tmp_dir = strndup(v, n);
6670                 if (!tmp_dir)
6671                         return log_oom();
6672                 if (v[n] != ' ')
6673                         goto finalize;
6674                 p = v + n + 1;
6675         }
6676
6677         v = startswith(p, "var-tmp-dir=");
6678         if (v) {
6679                 n = strcspn(v, " ");
6680                 var_tmp_dir = strndup(v, n);
6681                 if (!var_tmp_dir)
6682                         return log_oom();
6683                 if (v[n] != ' ')
6684                         goto finalize;
6685                 p = v + n + 1;
6686         }
6687
6688         v = startswith(p, "netns-socket-0=");
6689         if (v) {
6690                 char *buf;
6691
6692                 n = strcspn(v, " ");
6693                 buf = strndupa_safe(v, n);
6694
6695                 r = safe_atoi(buf, &netns_fdpair[0]);
6696                 if (r < 0)
6697                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6698                 if (!fdset_contains(fds, netns_fdpair[0]))
6699                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6700                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6701                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6702                 if (v[n] != ' ')
6703                         goto finalize;
6704                 p = v + n + 1;
6705         }
6706
6707         v = startswith(p, "netns-socket-1=");
6708         if (v) {
6709                 char *buf;
6710
6711                 n = strcspn(v, " ");
6712                 buf = strndupa_safe(v, n);
6713
6714                 r = safe_atoi(buf, &netns_fdpair[1]);
6715                 if (r < 0)
6716                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6717                 if (!fdset_contains(fds, netns_fdpair[1]))
6718                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6719                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6720                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6721                 if (v[n] != ' ')
6722                         goto finalize;
6723                 p = v + n + 1;
6724         }
6725
6726         v = startswith(p, "ipcns-socket-0=");
6727         if (v) {
6728                 char *buf;
6729
6730                 n = strcspn(v, " ");
6731                 buf = strndupa_safe(v, n);
6732
6733                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6734                 if (r < 0)
6735                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6736                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6737                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6738                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6739                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6740                 if (v[n] != ' ')
6741                         goto finalize;
6742                 p = v + n + 1;
6743         }
6744
6745         v = startswith(p, "ipcns-socket-1=");
6746         if (v) {
6747                 char *buf;
6748
6749                 n = strcspn(v, " ");
6750                 buf = strndupa_safe(v, n);
6751
6752                 r = safe_atoi(buf, &ipcns_fdpair[1]);
6753                 if (r < 0)
6754                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6755                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6756                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6757                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6758                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6759         }
6760
6761 finalize:
6762         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6763         if (r < 0)
6764                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6765         return 0;
6766 }
6767
6768 void exec_runtime_vacuum(Manager *m) {
6769         ExecRuntime *rt;
6770
6771         assert(m);
6772
6773         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6774
6775         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6776                 if (rt->n_ref > 0)
6777                         continue;
6778
6779                 (void) exec_runtime_free(rt, false);
6780         }
6781 }
6782
6783 void exec_params_clear(ExecParameters *p) {
6784         if (!p)
6785                 return;
6786
6787         p->environment = strv_free(p->environment);
6788         p->fd_names = strv_free(p->fd_names);
6789         p->fds = mfree(p->fds);
6790         p->exec_fd = safe_close(p->exec_fd);
6791 }
6792
6793 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6794         if (!sc)
6795                 return NULL;
6796
6797         free(sc->id);
6798         free(sc->data);
6799         return mfree(sc);
6800 }
6801
6802 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6803         if (!lc)
6804                 return NULL;
6805
6806         free(lc->id);
6807         free(lc->path);
6808         return mfree(lc);
6809 }
6810
6811 void exec_directory_done(ExecDirectory *d) {
6812         if (!d)
6813                 return;
6814
6815         for (size_t i = 0; i < d->n_items; i++) {
6816                 free(d->items[i].path);
6817                 strv_free(d->items[i].symlinks);
6818         }
6819
6820         d->items = mfree(d->items);
6821         d->n_items = 0;
6822         d->mode = 0755;
6823 }
6824
6825 int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6826         _cleanup_strv_free_ char **s = NULL;
6827         _cleanup_free_ char *p = NULL;
6828
6829         assert(d);
6830         assert(n);
6831         assert(path);
6832
6833         p = strdup(path);
6834         if (!p)
6835                 return -ENOMEM;
6836
6837         if (symlinks) {
6838                 s = strv_copy(symlinks);
6839                 if (!s)
6840                         return -ENOMEM;
6841         }
6842
6843         if (!GREEDY_REALLOC(*d, *n + 1))
6844                 return -ENOMEM;
6845
6846         (*d)[(*n) ++] = (ExecDirectoryItem) {
6847                 .path = TAKE_PTR(p),
6848                 .symlinks = TAKE_PTR(s),
6849         };
6850
6851         return 0;
6852 }
6853
6854 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6855 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
6856
6857 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6858         [EXEC_INPUT_NULL] = "null",
6859         [EXEC_INPUT_TTY] = "tty",
6860         [EXEC_INPUT_TTY_FORCE] = "tty-force",
6861         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6862         [EXEC_INPUT_SOCKET] = "socket",
6863         [EXEC_INPUT_NAMED_FD] = "fd",
6864         [EXEC_INPUT_DATA] = "data",
6865         [EXEC_INPUT_FILE] = "file",
6866 };
6867
6868 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6869
6870 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6871         [EXEC_OUTPUT_INHERIT] = "inherit",
6872         [EXEC_OUTPUT_NULL] = "null",
6873         [EXEC_OUTPUT_TTY] = "tty",
6874         [EXEC_OUTPUT_KMSG] = "kmsg",
6875         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6876         [EXEC_OUTPUT_JOURNAL] = "journal",
6877         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6878         [EXEC_OUTPUT_SOCKET] = "socket",
6879         [EXEC_OUTPUT_NAMED_FD] = "fd",
6880         [EXEC_OUTPUT_FILE] = "file",
6881         [EXEC_OUTPUT_FILE_APPEND] = "append",
6882         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6883 };
6884
6885 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6886
6887 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6888         [EXEC_UTMP_INIT] = "init",
6889         [EXEC_UTMP_LOGIN] = "login",
6890         [EXEC_UTMP_USER] = "user",
6891 };
6892
6893 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6894
6895 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6896         [EXEC_PRESERVE_NO] = "no",
6897         [EXEC_PRESERVE_YES] = "yes",
6898         [EXEC_PRESERVE_RESTART] = "restart",
6899 };
6900
6901 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6902
6903 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6904 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6905         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6906         [EXEC_DIRECTORY_STATE] = "StateDirectory",
6907         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6908         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6909         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6910 };
6911
6912 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6913
6914 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
6915 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6916         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
6917         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
6918         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
6919         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
6920         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
6921 };
6922
6923 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
6924
6925 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6926  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6927  * directories, specifically .timer units with their timestamp touch file. */
6928 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6929         [EXEC_DIRECTORY_RUNTIME] = "runtime",
6930         [EXEC_DIRECTORY_STATE] = "state",
6931         [EXEC_DIRECTORY_CACHE] = "cache",
6932         [EXEC_DIRECTORY_LOGS] = "logs",
6933         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6934 };
6935
6936 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6937
6938 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6939  * the service payload in. */
6940 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6941         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6942         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6943         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6944         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6945         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6946 };
6947
6948 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6949
6950 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6951         [EXEC_KEYRING_INHERIT] = "inherit",
6952         [EXEC_KEYRING_PRIVATE] = "private",
6953         [EXEC_KEYRING_SHARED] = "shared",
6954 };
6955
6956 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);