src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "cgroup-setup.h"
  47 #include "chown-recursive.h"
  48 #include "cpu-set-util.h"
  49 #include "def.h"
  50 #include "env-file.h"
  51 #include "env-util.h"
  52 #include "errno-list.h"
  53 #include "execute.h"
  54 #include "exit-status.h"
  55 #include "fd-util.h"
  56 #include "fileio.h"
  57 #include "format-util.h"
  58 #include "fs-util.h"
  59 #include "glob-util.h"
  60 #include "hexdecoct.h"
  61 #include "io-util.h"
  62 #include "ioprio.h"
  63 #include "label.h"
  64 #include "log.h"
  65 #include "macro.h"
  66 #include "manager.h"
  67 #include "memory-util.h"
  68 #include "missing_fs.h"
  69 #include "mkdir.h"
  70 #include "mount-util.h"
  71 #include "mountpoint-util.h"
  72 #include "namespace.h"
  73 #include "parse-util.h"
  74 #include "path-util.h"
  75 #include "process-util.h"
  76 #include "random-util.h"
  77 #include "rlimit-util.h"
  78 #include "rm-rf.h"
  79 #if HAVE_SECCOMP
  80 #include "seccomp-util.h"
  81 #endif
  82 #include "securebits-util.h"
  83 #include "selinux-util.h"
  84 #include "signal-util.h"
  85 #include "smack-util.h"
  86 #include "socket-util.h"
  87 #include "special.h"
  88 #include "stat-util.h"
  89 #include "string-table.h"
  90 #include "string-util.h"
  91 #include "strv.h"
  92 #include "syslog-util.h"
  93 #include "terminal-util.h"
  94 #include "tmpfile-util.h"
  95 #include "umask-util.h"
  96 #include "unit-serialize.h"
  97 #include "user-util.h"
  98 #include "utmp-wtmp.h"
  99
 100 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 101 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 102
 103 #define SNDBUF_SIZE (8*1024*1024)
 104
 /* Relocates the descriptors in fds[] so that entry i ends up as fd number i+3, i.e. the array
  * occupies the contiguous range starting right after stdin/stdout/stderr.
  *
  * fds[] is modified in place: every entry that had to be moved is closed and replaced by its
  * duplicate. Returns 0 on success (including n_fds == 0), or a negative errno if fcntl() fails. */
 static int shift_fds(int fds[], size_t n_fds) {
         int first = 0;

         if (n_fds <= 0)
                 return 0;

         assert(fds);

         for (;;) {
                 int retry_at = -1;

                 for (int idx = first; idx < (int) n_fds; idx++) {
                         int want = idx + 3, got;

                         /* Slot is already where it belongs */
                         if (fds[idx] == want)
                                 continue;

                         /* F_DUPFD hands out the lowest free fd >= want, which may be larger than
                          * want if that number is still occupied by a later entry. */
                         got = fcntl(fds[idx], F_DUPFD, want);
                         if (got < 0)
                                 return -errno;

                         safe_close(fds[idx]);
                         fds[idx] = got;

                         /* Remember the first slot that did not land on its target, so the next
                          * pass can start over from there once the blocking fd has been moved. */
                         if (retry_at < 0 && got != want)
                                 retry_at = idx;
                 }

                 if (retry_at < 0)
                         return 0;

                 first = retry_at;
         }
 }
 144
 /* Adjusts the file descriptor flags of the first n_socket_fds + n_storage_fds entries of fds[].
  *
  * O_NONBLOCK is set or cleared according to 'nonblock', but only for the leading n_socket_fds
  * entries. FD_CLOEXEC is cleared on every entry. Returns 0 on success or the first negative
  * error reported by fd_nonblock()/fd_cloexec(). */
 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
         size_t total = n_socket_fds + n_storage_fds;

         if (total <= 0)
                 return 0;

         assert(fds);

         for (size_t k = 0; k < total; k++) {
                 int r;

                 /* The socket fds come first in the array; the storage fds are left alone here */
                 if (k < n_socket_fds) {
                         r = fd_nonblock(fds[k], nonblock);
                         if (r < 0)
                                 return r;
                 }

                 /* These fds are meant to be passed on to the child, hence FD_CLOEXEC is always
                  * dropped */
                 r = fd_cloexec(fds[k], false);
                 if (r < 0)
                         return r;
         }

         return 0;
 }
 177
 178 static const char *exec_context_tty_path(const ExecContext *context) {
 179         assert(context);
 180
 181         if (context->stdio_as_fds)
 182                 return NULL;
 183
 184         if (context->tty_path)
 185                 return context->tty_path;
 186
 187         return "/dev/console";
 188 }
 189
 190 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 191         const char *path;
 192
 193         assert(context);
 194
 195         path = exec_context_tty_path(context);
 196
 197         if (context->tty_vhangup) {
 198                 if (p && p->stdin_fd >= 0)
 199                         (void) terminal_vhangup_fd(p->stdin_fd);
 200                 else if (path)
 201                         (void) terminal_vhangup(path);
 202         }
 203
 204         if (context->tty_reset) {
 205                 if (p && p->stdin_fd >= 0)
 206                         (void) reset_terminal_fd(p->stdin_fd, true);
 207                 else if (path)
 208                         (void) reset_terminal(path);
 209         }
 210
 211         if (context->tty_vt_disallocate && path)
 212                 (void) vt_disallocate(path);
 213 }
 214
 215 static bool is_terminal_input(ExecInput i) {
 216         return IN_SET(i,
 217                       EXEC_INPUT_TTY,
 218                       EXEC_INPUT_TTY_FORCE,
 219                       EXEC_INPUT_TTY_FAIL);
 220 }
 221
 222 static bool is_terminal_output(ExecOutput o) {
 223         return IN_SET(o,
 224                       EXEC_OUTPUT_TTY,
 225                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 226                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 227 }
 228
 229 static bool is_kmsg_output(ExecOutput o) {
 230         return IN_SET(o,
 231                       EXEC_OUTPUT_KMSG,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 233 }
 234
 235 static bool exec_context_needs_term(const ExecContext *c) {
 236         assert(c);
 237
 238         /* Return true if the execution context suggests we should set $TERM to something useful. */
 239
 240         if (is_terminal_input(c->std_input))
 241                 return true;
 242
 243         if (is_terminal_output(c->std_output))
 244                 return true;
 245
 246         if (is_terminal_output(c->std_error))
 247                 return true;
 248
 249         return !!c->tty_path;
 250 }
 251
 /* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting descriptor
  * to fd number 'nfd' via move_fd(). Returns move_fd()'s result, or a negative errno if the open
  * fails. */
 static int open_null_as(int flags, int nfd) {
         int null_fd;

         assert(nfd >= 0);

         null_fd = open("/dev/null", flags|O_NOCTTY);
         if (null_fd >= 0)
                 return move_fd(null_fd, nfd, false);

         return -errno;
 }
 263
 264 static int connect_journal_socket(
 265                 int fd,
 266                 const char *log_namespace,
 267                 uid_t uid,
 268                 gid_t gid) {
 269
 270         union sockaddr_union sa;
 271         socklen_t sa_len;
 272         uid_t olduid = UID_INVALID;
 273         gid_t oldgid = GID_INVALID;
 274         const char *j;
 275         int r;
 276
 277         j = log_namespace ?
 278                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 279                 "/run/systemd/journal/stdout";
 280         r = sockaddr_un_set_path(&sa.un, j);
 281         if (r < 0)
 282                 return r;
 283         sa_len = r;
 284
 285         if (gid_is_valid(gid)) {
 286                 oldgid = getgid();
 287
 288                 if (setegid(gid) < 0)
 289                         return -errno;
 290         }
 291
 292         if (uid_is_valid(uid)) {
 293                 olduid = getuid();
 294
 295                 if (seteuid(uid) < 0) {
 296                         r = -errno;
 297                         goto restore_gid;
 298                 }
 299         }
 300
 301         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 302
 303         /* If we fail to restore the uid or gid, things will likely
 304            fail later on. This should only happen if an LSM interferes. */
 305
 306         if (uid_is_valid(uid))
 307                 (void) seteuid(olduid);
 308
 309  restore_gid:
 310         if (gid_is_valid(gid))
 311                 (void) setegid(oldgid);
 312
 313         return r;
 314 }
 315
 316 static int connect_logger_as(
 317                 const Unit *unit,
 318                 const ExecContext *context,
 319                 const ExecParameters *params,
 320                 ExecOutput output,
 321                 const char *ident,
 322                 int nfd,
 323                 uid_t uid,
 324                 gid_t gid) {
 325
 326         _cleanup_close_ int fd = -1;
 327         int r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0)
 344                 return -errno;
 345
 346         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 347
 348         if (dprintf(fd,
 349                 "%s\n"
 350                 "%s\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n",
 356                 context->syslog_identifier ?: ident,
 357                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 358                 context->syslog_priority,
 359                 !!context->syslog_level_prefix,
 360                 false,
 361                 is_kmsg_output(output),
 362                 is_terminal_output(output)) < 0)
 363                 return -errno;
 364
 365         return move_fd(TAKE_FD(fd), nfd, false);
 366 }
 367
 /* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the descriptor to
  * fd number 'nfd' via move_fd(). Returns move_fd()'s result, or the negative error from
  * open_terminal(). */
 static int open_terminal_as(const char *path, int flags, int nfd) {
         int tty_fd;

         assert(path);
         assert(nfd >= 0);

         tty_fd = open_terminal(path, flags | O_NOCTTY);
         if (tty_fd >= 0)
                 return move_fd(tty_fd, nfd, false);

         return tty_fd;
 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa;
 383         socklen_t sa_len;
 384         _cleanup_close_ int fd = -1;
 385         int r;
 386
 387         assert(path);
 388
 389         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 390                 flags |= O_CREAT;
 391
 392         fd = open(path, flags|O_NOCTTY, mode);
 393         if (fd >= 0)
 394                 return TAKE_FD(fd);
 395
 396         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 397                 return -errno;
 398
 399         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 400
 401         r = sockaddr_un_set_path(&sa.un, path);
 402         if (r < 0)
 403                 return r == -EINVAL ? -ENXIO : r;
 404         sa_len = r;
 405
 406         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 407         if (fd < 0)
 408                 return -errno;
 409
 410         if (connect(fd, &sa.sa, sa_len) < 0)
 411                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 412                                                            * indication that this wasn't an AF_UNIX socket after all */
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 r = 0;
 420         if (r < 0)
 421                 return -errno;
 422
 423         return TAKE_FD(fd);
 424 }
 425
 426 static int fixup_input(
 427                 const ExecContext *context,
 428                 int socket_fd,
 429                 bool apply_tty_stdin) {
 430
 431         ExecInput std_input;
 432
 433         assert(context);
 434
 435         std_input = context->std_input;
 436
 437         if (is_terminal_input(std_input) && !apply_tty_stdin)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         return std_input;
 447 }
 448
 449 static int fixup_output(ExecOutput std_output, int socket_fd) {
 450
 451         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 452                 return EXEC_OUTPUT_INHERIT;
 453
 454         return std_output;
 455 }
 456
 457 static int setup_input(
 458                 const ExecContext *context,
 459                 const ExecParameters *params,
 460                 int socket_fd,
 461                 const int named_iofds[static 3]) {
 462
 463         ExecInput i;
 464
 465         assert(context);
 466         assert(params);
 467         assert(named_iofds);
 468
 469         if (params->stdin_fd >= 0) {
 470                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 471                         return -errno;
 472
 473                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 474                 if (isatty(STDIN_FILENO)) {
 475                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 476                         (void) reset_terminal_fd(STDIN_FILENO, true);
 477                 }
 478
 479                 return STDIN_FILENO;
 480         }
 481
 482         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 483
 484         switch (i) {
 485
 486         case EXEC_INPUT_NULL:
 487                 return open_null_as(O_RDONLY, STDIN_FILENO);
 488
 489         case EXEC_INPUT_TTY:
 490         case EXEC_INPUT_TTY_FORCE:
 491         case EXEC_INPUT_TTY_FAIL: {
 492                 int fd;
 493
 494                 fd = acquire_terminal(exec_context_tty_path(context),
 495                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 496                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 497                                                                   ACQUIRE_TERMINAL_WAIT,
 498                                       USEC_INFINITY);
 499                 if (fd < 0)
 500                         return fd;
 501
 502                 return move_fd(fd, STDIN_FILENO, false);
 503         }
 504
 505         case EXEC_INPUT_SOCKET:
 506                 assert(socket_fd >= 0);
 507
 508                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 509
 510         case EXEC_INPUT_NAMED_FD:
 511                 assert(named_iofds[STDIN_FILENO] >= 0);
 512
 513                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 514                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 515
 516         case EXEC_INPUT_DATA: {
 517                 int fd;
 518
 519                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 520                 if (fd < 0)
 521                         return fd;
 522
 523                 return move_fd(fd, STDIN_FILENO, false);
 524         }
 525
 526         case EXEC_INPUT_FILE: {
 527                 bool rw;
 528                 int fd;
 529
 530                 assert(context->stdio_file[STDIN_FILENO]);
 531
 532                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 533                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 534
 535                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         default:
 543                 assert_not_reached("Unknown input type");
 544         }
 545 }
 546
 547 static bool can_inherit_stderr_from_stdout(
 548                 const ExecContext *context,
 549                 ExecOutput o,
 550                 ExecOutput e) {
 551
 552         assert(context);
 553
 554         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 555          * stderr fd */
 556
 557         if (e == EXEC_OUTPUT_INHERIT)
 558                 return true;
 559         if (e != o)
 560                 return false;
 561
 562         if (e == EXEC_OUTPUT_NAMED_FD)
 563                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 564
 565         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 566                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 567
 568         return true;
 569 }
 570
 571 static int setup_output(
 572                 const Unit *unit,
 573                 const ExecContext *context,
 574                 const ExecParameters *params,
 575                 int fileno,
 576                 int socket_fd,
 577                 const int named_iofds[static 3],
 578                 const char *ident,
 579                 uid_t uid,
 580                 gid_t gid,
 581                 dev_t *journal_stream_dev,
 582                 ino_t *journal_stream_ino) {
 583
 584         ExecOutput o;
 585         ExecInput i;
 586         int r;
 587
 588         assert(unit);
 589         assert(context);
 590         assert(params);
 591         assert(ident);
 592         assert(journal_stream_dev);
 593         assert(journal_stream_ino);
 594
 595         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 596
 597                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 598                         return -errno;
 599
 600                 return STDOUT_FILENO;
 601         }
 602
 603         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 604                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 605                         return -errno;
 606
 607                 return STDERR_FILENO;
 608         }
 609
 610         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 611         o = fixup_output(context->std_output, socket_fd);
 612
 613         if (fileno == STDERR_FILENO) {
 614                 ExecOutput e;
 615                 e = fixup_output(context->std_error, socket_fd);
 616
 617                 /* This expects the input and output are already set up */
 618
 619                 /* Don't change the stderr file descriptor if we inherit all
 620                  * the way and are not on a tty */
 621                 if (e == EXEC_OUTPUT_INHERIT &&
 622                     o == EXEC_OUTPUT_INHERIT &&
 623                     i == EXEC_INPUT_NULL &&
 624                     !is_terminal_input(context->std_input) &&
 625                     getppid () != 1)
 626                         return fileno;
 627
 628                 /* Duplicate from stdout if possible */
 629                 if (can_inherit_stderr_from_stdout(context, o, e))
 630                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 631
 632                 o = e;
 633
 634         } else if (o == EXEC_OUTPUT_INHERIT) {
 635                 /* If input got downgraded, inherit the original value */
 636                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 637                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 638
 639                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 640                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 641                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 642
 643                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 644                 if (getppid() != 1)
 645                         return fileno;
 646
 647                 /* We need to open /dev/null here anew, to get the right access mode. */
 648                 return open_null_as(O_WRONLY, fileno);
 649         }
 650
 651         switch (o) {
 652
 653         case EXEC_OUTPUT_NULL:
 654                 return open_null_as(O_WRONLY, fileno);
 655
 656         case EXEC_OUTPUT_TTY:
 657                 if (is_terminal_input(i))
 658                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 659
 660                 /* We don't reset the terminal if this is just about output */
 661                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 662
 663         case EXEC_OUTPUT_KMSG:
 664         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 665         case EXEC_OUTPUT_JOURNAL:
 666         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 667                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 668                 if (r < 0) {
 669                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 670                         r = open_null_as(O_WRONLY, fileno);
 671                 } else {
 672                         struct stat st;
 673
 674                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 675                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 676                          * services to detect whether they are connected to the journal or not.
 677                          *
 678                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 679                          * about STDERR as that's usually the best way to do logging. */
 680
 681                         if (fstat(fileno, &st) >= 0 &&
 682                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 683                                 *journal_stream_dev = st.st_dev;
 684                                 *journal_stream_ino = st.st_ino;
 685                         }
 686                 }
 687                 return r;
 688
 689         case EXEC_OUTPUT_SOCKET:
 690                 assert(socket_fd >= 0);
 691
 692                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 693
 694         case EXEC_OUTPUT_NAMED_FD:
 695                 assert(named_iofds[fileno] >= 0);
 696
 697                 (void) fd_nonblock(named_iofds[fileno], false);
 698                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 699
 700         case EXEC_OUTPUT_FILE:
 701         case EXEC_OUTPUT_FILE_APPEND:
 702         case EXEC_OUTPUT_FILE_TRUNCATE: {
 703                 bool rw;
 704                 int fd, flags;
 705
 706                 assert(context->stdio_file[fileno]);
 707
 708                 rw = context->std_input == EXEC_INPUT_FILE &&
 709                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 710
 711                 if (rw)
 712                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 713
 714                 flags = O_WRONLY;
 715                 if (o == EXEC_OUTPUT_FILE_APPEND)
 716                         flags |= O_APPEND;
 717                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 718                         flags |= O_TRUNC;
 719
 720                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 721                 if (fd < 0)
 722                         return fd;
 723
 724                 return move_fd(fd, fileno, 0);
 725         }
 726
 727         default:
 728                 assert_not_reached("Unknown error type");
 729         }
 730 }
 731
 732 static int chown_terminal(int fd, uid_t uid) {
 733         int r;
 734
 735         assert(fd >= 0);
 736
 737         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 738         if (isatty(fd) < 1) {
 739                 if (IN_SET(errno, EINVAL, ENOTTY))
 740                         return 0; /* not a tty */
 741
 742                 return -errno;
 743         }
 744
 745         /* This might fail. What matters are the results. */
 746         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 747         if (r < 0)
 748                 return r;
 749
 750         return 1;
 751 }
 752
 753 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 754         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 755         int r;
 756
 757         assert(_saved_stdin);
 758         assert(_saved_stdout);
 759
 760         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 761         if (saved_stdin < 0)
 762                 return -errno;
 763
 764         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 765         if (saved_stdout < 0)
 766                 return -errno;
 767
 768         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 769         if (fd < 0)
 770                 return fd;
 771
 772         r = chown_terminal(fd, getuid());
 773         if (r < 0)
 774                 return r;
 775
 776         r = reset_terminal_fd(fd, true);
 777         if (r < 0)
 778                 return r;
 779
 780         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 781         fd = -1;
 782         if (r < 0)
 783                 return r;
 784
 785         *_saved_stdin = saved_stdin;
 786         *_saved_stdout = saved_stdout;
 787
 788         saved_stdin = saved_stdout = -1;
 789
 790         return 0;
 791 }
 792
 793 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 794         assert(err < 0);
 795
 796         if (err == -ETIMEDOUT)
 797                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 798         else {
 799                 errno = -err;
 800                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 801         }
 802 }
 803
 804 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 805         _cleanup_close_ int fd = -1;
 806
 807         assert(vc);
 808
 809         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 810         if (fd < 0)
 811                 return;
 812
 813         write_confirm_error_fd(err, fd, u);
 814 }
 815
 /* Releases the terminal and puts the saved stdin/stdout descriptors back in place.
  *
  * The saved descriptors are closed in any case, and *saved_stdin / *saved_stdout are set to
  * safe_close()'s return value. Returns 0 on success, or the negative errno of the last dup2()
  * that failed. */
 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
         int ret = 0;

         assert(saved_stdin);
         assert(saved_stdout);

         release_terminal();

         if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                 ret = -errno;

         if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                 ret = -errno;

         *saved_stdin = safe_close(*saved_stdin);
         *saved_stdout = safe_close(*saved_stdout);

         return ret;
 }
 837
 /* Possible outcomes of asking the user for confirmation before spawning a command. */
 enum {
         CONFIRM_PRETEND_FAILURE = -1,   /* don't execute, pretend the command failed */
         CONFIRM_PRETEND_SUCCESS = 0,    /* don't execute, pretend the command succeeded */
         CONFIRM_EXECUTE         = 1,    /* go ahead and execute the command */
 };
 843
 844 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 845         int saved_stdout = -1, saved_stdin = -1, r;
 846         _cleanup_free_ char *e = NULL;
 847         char c;
 848
 849         /* For any internal errors, assume a positive response. */
 850         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 851         if (r < 0) {
 852                 write_confirm_error(r, vc, u);
 853                 return CONFIRM_EXECUTE;
 854         }
 855
 856         /* confirm_spawn might have been disabled while we were sleeping. */
 857         if (manager_is_confirm_spawn_disabled(u->manager)) {
 858                 r = 1;
 859                 goto restore_stdio;
 860         }
 861
 862         e = ellipsize(cmdline, 60, 100);
 863         if (!e) {
 864                 log_oom();
 865                 r = CONFIRM_EXECUTE;
 866                 goto restore_stdio;
 867         }
 868
 869         for (;;) {
 870                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 871                 if (r < 0) {
 872                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 873                         r = CONFIRM_EXECUTE;
 874                         goto restore_stdio;
 875                 }
 876
 877                 switch (c) {
 878                 case 'c':
 879                         printf("Resuming normal execution.\n");
 880                         manager_disable_confirm_spawn();
 881                         r = 1;
 882                         break;
 883                 case 'D':
 884                         unit_dump(u, stdout, "  ");
 885                         continue; /* ask again */
 886                 case 'f':
 887                         printf("Failing execution.\n");
 888                         r = CONFIRM_PRETEND_FAILURE;
 889                         break;
 890                 case 'h':
 891                         printf("  c - continue, proceed without asking anymore\n"
 892                                "  D - dump, show the state of the unit\n"
 893                                "  f - fail, don't execute the command and pretend it failed\n"
 894                                "  h - help\n"
 895                                "  i - info, show a short summary of the unit\n"
 896                                "  j - jobs, show jobs that are in progress\n"
 897                                "  s - skip, don't execute the command and pretend it succeeded\n"
 898                                "  y - yes, execute the command\n");
 899                         continue; /* ask again */
 900                 case 'i':
 901                         printf("  Description: %s\n"
 902                                "  Unit:        %s\n"
 903                                "  Command:     %s\n",
 904                                u->id, u->description, cmdline);
 905                         continue; /* ask again */
 906                 case 'j':
 907                         manager_dump_jobs(u->manager, stdout, "  ");
 908                         continue; /* ask again */
 909                 case 'n':
 910                         /* 'n' was removed in favor of 'f'. */
 911                         printf("Didn't understand 'n', did you mean 'f'?\n");
 912                         continue; /* ask again */
 913                 case 's':
 914                         printf("Skipping execution.\n");
 915                         r = CONFIRM_PRETEND_SUCCESS;
 916                         break;
 917                 case 'y':
 918                         r = CONFIRM_EXECUTE;
 919                         break;
 920                 default:
 921                         assert_not_reached("Unhandled choice");
 922                 }
 923                 break;
 924         }
 925
 926 restore_stdio:
 927         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 928         return r;
 929 }
 930
 931 static int get_fixed_user(const ExecContext *c, const char **user,
 932                           uid_t *uid, gid_t *gid,
 933                           const char **home, const char **shell) {
 934         int r;
 935         const char *name;
 936
 937         assert(c);
 938
 939         if (!c->user)
 940                 return 0;
 941
 942         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 943          * (i.e. are "/" or "/bin/nologin"). */
 944
 945         name = c->user;
 946         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 947         if (r < 0)
 948                 return r;
 949
 950         *user = name;
 951         return 0;
 952 }
 953
 954 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 955         int r;
 956         const char *name;
 957
 958         assert(c);
 959
 960         if (!c->group)
 961                 return 0;
 962
 963         name = c->group;
 964         r = get_group_creds(&name, gid, 0);
 965         if (r < 0)
 966                 return r;
 967
 968         *group = name;
 969         return 0;
 970 }
 971
 972 static int get_supplementary_groups(const ExecContext *c, const char *user,
 973                                     const char *group, gid_t gid,
 974                                     gid_t **supplementary_gids, int *ngids) {
 975         char **i;
 976         int r, k = 0;
 977         int ngroups_max;
 978         bool keep_groups = false;
 979         gid_t *groups = NULL;
 980         _cleanup_free_ gid_t *l_gids = NULL;
 981
 982         assert(c);
 983
 984         /*
 985          * If user is given, then lookup GID and supplementary groups list.
 986          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 987          * here and as early as possible so we keep the list of supplementary
 988          * groups of the caller.
 989          */
 990         if (user && gid_is_valid(gid) && gid != 0) {
 991                 /* First step, initialize groups from /etc/groups */
 992                 if (initgroups(user, gid) < 0)
 993                         return -errno;
 994
 995                 keep_groups = true;
 996         }
 997
 998         if (strv_isempty(c->supplementary_groups))
 999                 return 0;
1000
1001         /*
1002          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1003          * be positive, otherwise fail.
1004          */
1005         errno = 0;
1006         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1007         if (ngroups_max <= 0)
1008                 return errno_or_else(EOPNOTSUPP);
1009
1010         l_gids = new(gid_t, ngroups_max);
1011         if (!l_gids)
1012                 return -ENOMEM;
1013
1014         if (keep_groups) {
1015                 /*
1016                  * Lookup the list of groups that the user belongs to, we
1017                  * avoid NSS lookups here too for gid=0.
1018                  */
1019                 k = ngroups_max;
1020                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1021                         return -EINVAL;
1022         } else
1023                 k = 0;
1024
1025         STRV_FOREACH(i, c->supplementary_groups) {
1026                 const char *g;
1027
1028                 if (k >= ngroups_max)
1029                         return -E2BIG;
1030
1031                 g = *i;
1032                 r = get_group_creds(&g, l_gids+k, 0);
1033                 if (r < 0)
1034                         return r;
1035
1036                 k++;
1037         }
1038
1039         /*
1040          * Sets ngids to zero to drop all supplementary groups, happens
1041          * when we are under root and SupplementaryGroups= is empty.
1042          */
1043         if (k == 0) {
1044                 *ngids = 0;
1045                 return 0;
1046         }
1047
1048         /* Otherwise get the final list of supplementary groups */
1049         groups = memdup(l_gids, sizeof(gid_t) * k);
1050         if (!groups)
1051                 return -ENOMEM;
1052
1053         *supplementary_gids = groups;
1054         *ngids = k;
1055
1056         groups = NULL;
1057
1058         return 0;
1059 }
1060
/* Applies group credentials to the calling process.
 *
 * If ngids is positive, the supplementary group list is installed via maybe_setgroups() first. Then, if
 * gid is valid, the real, effective and saved GID are all set to it.
 *
 * Returns 0 on success, the negative error from maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1079
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in 'bits'
 * is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written in that case), 1 if new
 * securebits were installed, or -errno if reading or writing them via prctl() failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1093
1094 static int enforce_user(const ExecContext *context, uid_t uid) {
1095         assert(context);
1096         int r;
1097
1098         if (!uid_is_valid(uid))
1099                 return 0;
1100
1101         /* Sets (but doesn't look up) the uid and make sure we keep the
1102          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1103          * required, so we also need keep-caps in this case.
1104          */
1105
1106         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1107
1108                 /* First step: If we need to keep capabilities but
1109                  * drop privileges we need to make sure we keep our
1110                  * caps, while we drop privileges. */
1111                 if (uid != 0) {
1112                         /* Add KEEP_CAPS to the securebits */
1113                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1114                         if (r < 0)
1115                                 return r;
1116                 }
1117         }
1118
1119         /* Second step: actually set the uids */
1120         if (setresuid(uid, uid, uid) < 0)
1121                 return -errno;
1122
1123         /* At this point we should have all necessary capabilities but
1124            are otherwise a normal user. However, the caps might got
1125            corrupted due to the setresuid() so we need clean them up
1126            later. This is done outside of this call. */
1127
1128         return 0;
1129 }
1130
1131 #if HAVE_PAM
1132
1133 static int null_conv(
1134                 int num_msg,
1135                 const struct pam_message **msg,
1136                 struct pam_response **resp,
1137                 void *appdata_ptr) {
1138
1139         /* We don't support conversations */
1140
1141         return PAM_CONV_ERR;
1142 }
1143
1144 #endif
1145
/* Opens a PAM session for the given service name and user, and forks off a helper process "(sd-pam)"
 * that tears the session down again later.
 *
 * The PAM transaction is set up in the calling process: the TTY (the one passed in, or otherwise the one
 * stdin refers to, if any) and every entry of *env are handed to PAM, then account management,
 * credential establishment (a failure there is only logged at debug level) and session opening are run.
 * On success *env is replaced by the environment list PAM reports.
 *
 * The helper closes the passed fds, tries to drop to uid/gid, asks for SIGTERM on parent death and then
 * waits for that signal; afterwards it deletes the credentials and, if the parent is gone, closes the
 * session. The caller is expected to go on and exec() the actual daemon, so that the main PID of the
 * daemon is the one that was initially fork()ed.
 *
 * Returns the value of strv_free_and_replace() on success, -EPERM if a PAM call failed, or the negative
 * error from barrier_create()/safe_fork(). If PAM support is not compiled in, nothing is done and 0 is
 * returned. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        char **pam_env = NULL, **var;
        pid_t helper_pid = 0, our_pid;
        bool session_open = false;
        pam_handle_t *pamh = NULL;
        int pam_rc = PAM_SUCCESS;
        sigset_t saved_mask;
        int pam_flags = 0;
        int r;

        assert(name);
        assert(user);
        assert(env);

        /* The barrier is used further down to wait until the helper finished its setup. */
        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                pam_flags |= PAM_SILENT;

        pam_rc = pam_start(name, user, &conv, &pamh);
        if (pam_rc != PAM_SUCCESS) {
                pamh = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* No TTY was explicitly passed, but an fd passed to us directly might be a TTY. Figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_rc = pam_set_item(pamh, PAM_TTY, tty);
                if (pam_rc != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass our environment on to PAM, one variable at a time. */
        STRV_FOREACH(var, *env) {
                pam_rc = pam_putenv(pamh, *var);
                if (pam_rc != PAM_SUCCESS)
                        goto fail;
        }

        pam_rc = pam_acct_mgmt(pamh, pam_flags);
        if (pam_rc != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal. */
        pam_rc = pam_setcred(pamh, PAM_ESTABLISH_CRED | pam_flags);
        if (pam_rc != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(pamh, pam_rc));

        pam_rc = pam_open_session(pamh, pam_flags);
        if (pam_rc != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has a session to close. */
        session_open = true;

        pam_env = pam_getenvlist(pamh);
        if (!pam_env) {
                pam_rc = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */
        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        our_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &helper_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int caught, status = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise
                 * only those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect
                 * sd-pam threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents to kill their privileged
                 * children this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding
                 * dropping privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Only wait for SIGTERM if our parent process has not died already. */
                if (getppid() == our_pid) {
                        sigset_t term_only;

                        assert_se(sigemptyset(&term_only) >= 0);
                        assert_se(sigaddset(&term_only, SIGTERM) >= 0);

                        /* Retry on EINTR, give up on any other failure. */
                        while (sigwait(&term_only, &caught) < 0)
                                if (errno != EINTR)
                                        goto child_finish;

                        assert(caught == SIGTERM);
                }

                pam_rc = pam_setcred(pamh, PAM_DELETE_CRED | pam_flags);
                if (pam_rc != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != our_pid) {
                        pam_rc = pam_close_session(pamh, pam_flags);
                        if (pam_rc != PAM_SUCCESS)
                                goto child_finish;
                }

                status = 0;

        child_finish:
                pam_end(pamh, pam_rc | pam_flags);
                _exit(status);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the
         * handle here. */
        pamh = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't
         * want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_rc != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(pamh, pam_rc));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (pamh) {
                if (session_open)
                        pam_rc = pam_close_session(pamh, pam_flags);

                pam_end(pamh, pam_rc | pam_flags);
        }

        strv_free(pam_env);
        closelog();

        return r;
#else
        return 0;
#endif
}
1369
/* Renames the current process after the file name of the given path, wrapped in parentheses.
 *
 * Only the last 8 characters of the file name are used, so that the result fits in 10 chars (i.e. the
 * length of "/sbin/init") and looks pretty in /bin/ps. If the path has no file name component, the
 * process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char label[11];
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        /* Assemble "(" + name + ")" + NUL */
        label[0] = '(';
        memcpy(label + 1, base, n);
        label[n + 1] = ')';
        label[n + 2] = 0;

        rename_process(label);
}
1400
1401 static bool context_has_address_families(const ExecContext *c) {
1402         assert(c);
1403
1404         return c->address_families_allow_list ||
1405                 !set_isempty(c->address_families);
1406 }
1407
1408 static bool context_has_syscall_filters(const ExecContext *c) {
1409         assert(c);
1410
1411         return c->syscall_allow_list ||
1412                 !hashmap_isempty(c->syscall_filter);
1413 }
1414
1415 static bool context_has_syscall_logs(const ExecContext *c) {
1416         assert(c);
1417
1418         return c->syscall_log_allow_list ||
1419                 !hashmap_isempty(c->syscall_log);
1420 }
1421
1422 static bool context_has_no_new_privileges(const ExecContext *c) {
1423         assert(c);
1424
1425         if (c->no_new_privileges)
1426                 return true;
1427
1428         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1429                 return false;
1430
1431         /* We need NNP if we have any form of seccomp and are unprivileged */
1432         return context_has_address_families(c) ||
1433                 c->memory_deny_write_execute ||
1434                 c->restrict_realtime ||
1435                 c->restrict_suid_sgid ||
1436                 exec_context_restrict_namespaces_set(c) ||
1437                 c->protect_clock ||
1438                 c->protect_kernel_tunables ||
1439                 c->protect_kernel_modules ||
1440                 c->protect_kernel_logs ||
1441                 c->private_devices ||
1442                 context_has_syscall_filters(c) ||
1443                 context_has_syscall_logs(c) ||
1444                 !set_isempty(c->syscall_archs) ||
1445                 c->lock_personality ||
1446                 c->protect_hostname;
1447 }
1448
1449 static bool exec_context_has_credentials(const ExecContext *context) {
1450
1451         assert(context);
1452
1453         return !hashmap_isempty(context->set_credentials) ||
1454                 context->load_credentials;
1455 }
1456
1457 #if HAVE_SECCOMP
1458
1459 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1460
1461         if (is_seccomp_available())
1462                 return false;
1463
1464         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1465         return true;
1466 }
1467
1468 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1469         uint32_t negative_action, default_action, action;
1470         int r;
1471
1472         assert(u);
1473         assert(c);
1474
1475         if (!context_has_syscall_filters(c))
1476                 return 0;
1477
1478         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1479                 return 0;
1480
1481         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1482
1483         if (c->syscall_allow_list) {
1484                 default_action = negative_action;
1485                 action = SCMP_ACT_ALLOW;
1486         } else {
1487                 default_action = SCMP_ACT_ALLOW;
1488                 action = negative_action;
1489         }
1490
1491         if (needs_ambient_hack) {
1492                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1493                 if (r < 0)
1494                         return r;
1495         }
1496
1497         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1498 }
1499
1500 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1501 #ifdef SCMP_ACT_LOG
1502         uint32_t default_action, action;
1503 #endif
1504
1505         assert(u);
1506         assert(c);
1507
1508         if (!context_has_syscall_logs(c))
1509                 return 0;
1510
1511 #ifdef SCMP_ACT_LOG
1512         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1513                 return 0;
1514
1515         if (c->syscall_log_allow_list) {
1516                 /* Log nothing but the ones listed */
1517                 default_action = SCMP_ACT_ALLOW;
1518                 action = SCMP_ACT_LOG;
1519         } else {
1520                 /* Log everything but the ones listed */
1521                 default_action = SCMP_ACT_LOG;
1522                 action = SCMP_ACT_ALLOW;
1523         }
1524
1525         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1526 #else
1527         /* old libseccomp */
1528         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1529         return 0;
1530 #endif
1531 }
1532
1533 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1534         assert(u);
1535         assert(c);
1536
1537         if (set_isempty(c->syscall_archs))
1538                 return 0;
1539
1540         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1541                 return 0;
1542
1543         return seccomp_restrict_archs(c->syscall_archs);
1544 }
1545
1546 static int apply_address_families(const Unit* u, const ExecContext *c) {
1547         assert(u);
1548         assert(c);
1549
1550         if (!context_has_address_families(c))
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1554                 return 0;
1555
1556         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1557 }
1558
1559 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1560         assert(u);
1561         assert(c);
1562
1563         if (!c->memory_deny_write_execute)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1567                 return 0;
1568
1569         return seccomp_memory_deny_write_execute();
1570 }
1571
1572 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1573         assert(u);
1574         assert(c);
1575
1576         if (!c->restrict_realtime)
1577                 return 0;
1578
1579         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1580                 return 0;
1581
1582         return seccomp_restrict_realtime();
1583 }
1584
1585 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1586         assert(u);
1587         assert(c);
1588
1589         if (!c->restrict_suid_sgid)
1590                 return 0;
1591
1592         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1593                 return 0;
1594
1595         return seccomp_restrict_suid_sgid();
1596 }
1597
1598 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1599         assert(u);
1600         assert(c);
1601
1602         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1603          * let's protect even those systems where this is left on in the kernel. */
1604
1605         if (!c->protect_kernel_tunables)
1606                 return 0;
1607
1608         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1609                 return 0;
1610
1611         return seccomp_protect_sysctl();
1612 }
1613
1614 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1615         assert(u);
1616         assert(c);
1617
1618         /* Turn off module syscalls on ProtectKernelModules=yes */
1619
1620         if (!c->protect_kernel_modules)
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1624                 return 0;
1625
1626         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1627 }
1628
1629 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1630         assert(u);
1631         assert(c);
1632
1633         if (!c->protect_kernel_logs)
1634                 return 0;
1635
1636         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1637                 return 0;
1638
1639         return seccomp_protect_syslog();
1640 }
1641
1642 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1643         assert(u);
1644         assert(c);
1645
1646         if (!c->protect_clock)
1647                 return 0;
1648
1649         if (skip_seccomp_unavailable(u, "ProtectClock="))
1650                 return 0;
1651
1652         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1653 }
1654
1655 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1656         assert(u);
1657         assert(c);
1658
1659         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1660
1661         if (!c->private_devices)
1662                 return 0;
1663
1664         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1665                 return 0;
1666
1667         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1668 }
1669
1670 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1671         assert(u);
1672         assert(c);
1673
1674         if (!exec_context_restrict_namespaces_set(c))
1675                 return 0;
1676
1677         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1678                 return 0;
1679
1680         return seccomp_restrict_namespaces(c->restrict_namespaces);
1681 }
1682
1683 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1684         unsigned long personality;
1685         int r;
1686
1687         assert(u);
1688         assert(c);
1689
1690         if (!c->lock_personality)
1691                 return 0;
1692
1693         if (skip_seccomp_unavailable(u, "LockPersonality="))
1694                 return 0;
1695
1696         personality = c->personality;
1697
1698         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1699         if (personality == PERSONALITY_INVALID) {
1700
1701                 r = opinionated_personality(&personality);
1702                 if (r < 0)
1703                         return r;
1704         }
1705
1706         return seccomp_lock_personality(personality);
1707 }
1708
1709 #endif
1710
1711 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1712         assert(u);
1713         assert(c);
1714
1715         if (!c->protect_hostname)
1716                 return 0;
1717
1718         if (ns_type_supported(NAMESPACE_UTS)) {
1719                 if (unshare(CLONE_NEWUTS) < 0) {
1720                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1721                                 *ret_exit_status = EXIT_NAMESPACE;
1722                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1723                         }
1724
1725                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1726                 }
1727         } else
1728                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1729
1730 #if HAVE_SECCOMP
1731         int r;
1732
1733         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1734                 return 0;
1735
1736         r = seccomp_protect_hostname();
1737         if (r < 0) {
1738                 *ret_exit_status = EXIT_SECCOMP;
1739                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1740         }
1741 #endif
1742
1743         return 0;
1744 }
1745
1746 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1747         assert(idle_pipe);
1748
1749         idle_pipe[1] = safe_close(idle_pipe[1]);
1750         idle_pipe[2] = safe_close(idle_pipe[2]);
1751
1752         if (idle_pipe[0] >= 0) {
1753                 int r;
1754
1755                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1756
1757                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1758                         ssize_t n;
1759
1760                         /* Signal systemd that we are bored and want to continue. */
1761                         n = write(idle_pipe[3], "x", 1);
1762                         if (n > 0)
1763                                 /* Wait for systemd to react to the signal above. */
1764                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1765                 }
1766
1767                 idle_pipe[0] = safe_close(idle_pipe[0]);
1768
1769         }
1770
1771         idle_pipe[3] = safe_close(idle_pipe[3]);
1772 }
1773
1774 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1775
1776 static int build_environment(
1777                 const Unit *u,
1778                 const ExecContext *c,
1779                 const ExecParameters *p,
1780                 size_t n_fds,
1781                 const char *home,
1782                 const char *username,
1783                 const char *shell,
1784                 dev_t journal_stream_dev,
1785                 ino_t journal_stream_ino,
1786                 char ***ret) {
1787
1788         _cleanup_strv_free_ char **our_env = NULL;
1789         size_t n_env = 0;
1790         char *x;
1791
1792         assert(u);
1793         assert(c);
1794         assert(p);
1795         assert(ret);
1796
1797 #define N_ENV_VARS 17
1798         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1799         if (!our_env)
1800                 return -ENOMEM;
1801
1802         if (n_fds > 0) {
1803                 _cleanup_free_ char *joined = NULL;
1804
1805                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1806                         return -ENOMEM;
1807                 our_env[n_env++] = x;
1808
1809                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1810                         return -ENOMEM;
1811                 our_env[n_env++] = x;
1812
1813                 joined = strv_join(p->fd_names, ":");
1814                 if (!joined)
1815                         return -ENOMEM;
1816
1817                 x = strjoin("LISTEN_FDNAMES=", joined);
1818                 if (!x)
1819                         return -ENOMEM;
1820                 our_env[n_env++] = x;
1821         }
1822
1823         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1824                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1825                         return -ENOMEM;
1826                 our_env[n_env++] = x;
1827
1828                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1829                         return -ENOMEM;
1830                 our_env[n_env++] = x;
1831         }
1832
1833         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1834          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1835          * check the database directly. */
1836         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1837                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1838                 if (!x)
1839                         return -ENOMEM;
1840                 our_env[n_env++] = x;
1841         }
1842
1843         if (home) {
1844                 x = strjoin("HOME=", home);
1845                 if (!x)
1846                         return -ENOMEM;
1847
1848                 path_simplify(x + 5, true);
1849                 our_env[n_env++] = x;
1850         }
1851
1852         if (username) {
1853                 x = strjoin("LOGNAME=", username);
1854                 if (!x)
1855                         return -ENOMEM;
1856                 our_env[n_env++] = x;
1857
1858                 x = strjoin("USER=", username);
1859                 if (!x)
1860                         return -ENOMEM;
1861                 our_env[n_env++] = x;
1862         }
1863
1864         if (shell) {
1865                 x = strjoin("SHELL=", shell);
1866                 if (!x)
1867                         return -ENOMEM;
1868
1869                 path_simplify(x + 6, true);
1870                 our_env[n_env++] = x;
1871         }
1872
1873         if (!sd_id128_is_null(u->invocation_id)) {
1874                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1875                         return -ENOMEM;
1876
1877                 our_env[n_env++] = x;
1878         }
1879
1880         if (exec_context_needs_term(c)) {
1881                 const char *tty_path, *term = NULL;
1882
1883                 tty_path = exec_context_tty_path(c);
1884
1885                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1886                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1887                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1888
1889                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1890                         term = getenv("TERM");
1891
1892                 if (!term)
1893                         term = default_term_for_tty(tty_path);
1894
1895                 x = strjoin("TERM=", term);
1896                 if (!x)
1897                         return -ENOMEM;
1898                 our_env[n_env++] = x;
1899         }
1900
1901         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1902                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1903                         return -ENOMEM;
1904
1905                 our_env[n_env++] = x;
1906         }
1907
1908         if (c->log_namespace) {
1909                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1910                 if (!x)
1911                         return -ENOMEM;
1912
1913                 our_env[n_env++] = x;
1914         }
1915
1916         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1917                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1918                 const char *n;
1919
1920                 if (!p->prefix[t])
1921                         continue;
1922
1923                 if (strv_isempty(c->directories[t].paths))
1924                         continue;
1925
1926                 n = exec_directory_env_name_to_string(t);
1927                 if (!n)
1928                         continue;
1929
1930                 pre = strjoin(p->prefix[t], "/");
1931                 if (!pre)
1932                         return -ENOMEM;
1933
1934                 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
1935                 if (!joined)
1936                         return -ENOMEM;
1937
1938                 x = strjoin(n, "=", joined);
1939                 if (!x)
1940                         return -ENOMEM;
1941
1942                 our_env[n_env++] = x;
1943         }
1944
1945         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1946                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1947                 if (!x)
1948                         return -ENOMEM;
1949
1950                 our_env[n_env++] = x;
1951         }
1952
1953         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1954                 return -ENOMEM;
1955
1956         our_env[n_env++] = x;
1957
1958         our_env[n_env++] = NULL;
1959         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1960 #undef N_ENV_VARS
1961
1962         *ret = TAKE_PTR(our_env);
1963
1964         return 0;
1965 }
1966
1967 static int build_pass_environment(const ExecContext *c, char ***ret) {
1968         _cleanup_strv_free_ char **pass_env = NULL;
1969         size_t n_env = 0, n_bufsize = 0;
1970         char **i;
1971
1972         STRV_FOREACH(i, c->pass_environment) {
1973                 _cleanup_free_ char *x = NULL;
1974                 char *v;
1975
1976                 v = getenv(*i);
1977                 if (!v)
1978                         continue;
1979                 x = strjoin(*i, "=", v);
1980                 if (!x)
1981                         return -ENOMEM;
1982
1983                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1984                         return -ENOMEM;
1985
1986                 pass_env[n_env++] = TAKE_PTR(x);
1987                 pass_env[n_env] = NULL;
1988         }
1989
1990         *ret = TAKE_PTR(pass_env);
1991
1992         return 0;
1993 }
1994
1995 bool exec_needs_mount_namespace(
1996                 const ExecContext *context,
1997                 const ExecParameters *params,
1998                 const ExecRuntime *runtime) {
1999
2000         assert(context);
2001
2002         if (context->root_image)
2003                 return true;
2004
2005         if (!strv_isempty(context->read_write_paths) ||
2006             !strv_isempty(context->read_only_paths) ||
2007             !strv_isempty(context->inaccessible_paths) ||
2008             !strv_isempty(context->exec_paths) ||
2009             !strv_isempty(context->no_exec_paths))
2010                 return true;
2011
2012         if (context->n_bind_mounts > 0)
2013                 return true;
2014
2015         if (context->n_temporary_filesystems > 0)
2016                 return true;
2017
2018         if (context->n_mount_images > 0)
2019                 return true;
2020
2021         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2022                 return true;
2023
2024         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2025                 return true;
2026
2027         if (context->private_devices ||
2028             context->private_mounts ||
2029             context->protect_system != PROTECT_SYSTEM_NO ||
2030             context->protect_home != PROTECT_HOME_NO ||
2031             context->protect_kernel_tunables ||
2032             context->protect_kernel_modules ||
2033             context->protect_kernel_logs ||
2034             context->protect_control_groups ||
2035             context->protect_proc != PROTECT_PROC_DEFAULT ||
2036             context->proc_subset != PROC_SUBSET_ALL)
2037                 return true;
2038
2039         if (context->root_directory) {
2040                 if (exec_context_get_effective_mount_apivfs(context))
2041                         return true;
2042
2043                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2044                         if (params && !params->prefix[t])
2045                                 continue;
2046
2047                         if (!strv_isempty(context->directories[t].paths))
2048                                 return true;
2049                 }
2050         }
2051
2052         if (context->dynamic_user &&
2053             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
2054              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2055              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2056                 return true;
2057
2058         if (context->log_namespace)
2059                 return true;
2060
2061         return false;
2062 }
2063
2064 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2065         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2066         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2067         _cleanup_close_ int unshare_ready_fd = -1;
2068         _cleanup_(sigkill_waitp) pid_t pid = 0;
2069         uint64_t c = 1;
2070         ssize_t n;
2071         int r;
2072
2073         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2074          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2075          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2076          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2077          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2078          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2079          * continues execution normally.
2080          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2081          * does not need CAP_SETUID to write the single line mapping to itself. */
2082
2083         /* Can only set up multiple mappings with CAP_SETUID. */
2084         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2085                 r = asprintf(&uid_map,
2086                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2087                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2088                              ouid, ouid, uid, uid);
2089         else
2090                 r = asprintf(&uid_map,
2091                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2092                              ouid, ouid);
2093
2094         if (r < 0)
2095                 return -ENOMEM;
2096
2097         /* Can only set up multiple mappings with CAP_SETGID. */
2098         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2099                 r = asprintf(&gid_map,
2100                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2101                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2102                              ogid, ogid, gid, gid);
2103         else
2104                 r = asprintf(&gid_map,
2105                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2106                              ogid, ogid);
2107
2108         if (r < 0)
2109                 return -ENOMEM;
2110
2111         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2112          * namespace. */
2113         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2114         if (unshare_ready_fd < 0)
2115                 return -errno;
2116
2117         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2118          * failed. */
2119         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2120                 return -errno;
2121
2122         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2123         if (r < 0)
2124                 return r;
2125         if (r == 0) {
2126                 _cleanup_close_ int fd = -1;
2127                 const char *a;
2128                 pid_t ppid;
2129
2130                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2131                  * here, after the parent opened its own user namespace. */
2132
2133                 ppid = getppid();
2134                 errno_pipe[0] = safe_close(errno_pipe[0]);
2135
2136                 /* Wait until the parent unshared the user namespace */
2137                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2138                         r = -errno;
2139                         goto child_fail;
2140                 }
2141
2142                 /* Disable the setgroups() system call in the child user namespace, for good. */
2143                 a = procfs_file_alloca(ppid, "setgroups");
2144                 fd = open(a, O_WRONLY|O_CLOEXEC);
2145                 if (fd < 0) {
2146                         if (errno != ENOENT) {
2147                                 r = -errno;
2148                                 goto child_fail;
2149                         }
2150
2151                         /* If the file is missing the kernel is too old, let's continue anyway. */
2152                 } else {
2153                         if (write(fd, "deny\n", 5) < 0) {
2154                                 r = -errno;
2155                                 goto child_fail;
2156                         }
2157
2158                         fd = safe_close(fd);
2159                 }
2160
2161                 /* First write the GID map */
2162                 a = procfs_file_alloca(ppid, "gid_map");
2163                 fd = open(a, O_WRONLY|O_CLOEXEC);
2164                 if (fd < 0) {
2165                         r = -errno;
2166                         goto child_fail;
2167                 }
2168                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2169                         r = -errno;
2170                         goto child_fail;
2171                 }
2172                 fd = safe_close(fd);
2173
2174                 /* The write the UID map */
2175                 a = procfs_file_alloca(ppid, "uid_map");
2176                 fd = open(a, O_WRONLY|O_CLOEXEC);
2177                 if (fd < 0) {
2178                         r = -errno;
2179                         goto child_fail;
2180                 }
2181                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2182                         r = -errno;
2183                         goto child_fail;
2184                 }
2185
2186                 _exit(EXIT_SUCCESS);
2187
2188         child_fail:
2189                 (void) write(errno_pipe[1], &r, sizeof(r));
2190                 _exit(EXIT_FAILURE);
2191         }
2192
2193         errno_pipe[1] = safe_close(errno_pipe[1]);
2194
2195         if (unshare(CLONE_NEWUSER) < 0)
2196                 return -errno;
2197
2198         /* Let the child know that the namespace is ready now */
2199         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2200                 return -errno;
2201
2202         /* Try to read an error code from the child */
2203         n = read(errno_pipe[0], &r, sizeof(r));
2204         if (n < 0)
2205                 return -errno;
2206         if (n == sizeof(r)) { /* an error code was sent to us */
2207                 if (r < 0)
2208                         return r;
2209                 return -EIO;
2210         }
2211         if (n != 0) /* on success we should have read 0 bytes */
2212                 return -EIO;
2213
2214         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2215         pid = 0;
2216         if (r < 0)
2217                 return r;
2218         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2219                 return -EIO;
2220
2221         return 0;
2222 }
2223
2224 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2225         if (!context->dynamic_user)
2226                 return false;
2227
2228         if (type == EXEC_DIRECTORY_CONFIGURATION)
2229                 return false;
2230
2231         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2232                 return false;
2233
2234         return true;
2235 }
2236
2237 static int setup_exec_directory(
2238                 const ExecContext *context,
2239                 const ExecParameters *params,
2240                 uid_t uid,
2241                 gid_t gid,
2242                 ExecDirectoryType type,
2243                 int *exit_status) {
2244
2245         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2246                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2247                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2248                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2249                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2250                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2251         };
2252         char **rt;
2253         int r;
2254
2255         assert(context);
2256         assert(params);
2257         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2258         assert(exit_status);
2259
2260         if (!params->prefix[type])
2261                 return 0;
2262
2263         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2264                 if (!uid_is_valid(uid))
2265                         uid = 0;
2266                 if (!gid_is_valid(gid))
2267                         gid = 0;
2268         }
2269
2270         STRV_FOREACH(rt, context->directories[type].paths) {
2271                 _cleanup_free_ char *p = NULL, *pp = NULL;
2272
2273                 p = path_join(params->prefix[type], *rt);
2274                 if (!p) {
2275                         r = -ENOMEM;
2276                         goto fail;
2277                 }
2278
2279                 r = mkdir_parents_label(p, 0755);
2280                 if (r < 0)
2281                         goto fail;
2282
2283                 if (exec_directory_is_private(context, type)) {
2284                         _cleanup_free_ char *private_root = NULL;
2285
2286                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2287                          * case we want to avoid leaving a directory around fully accessible that is owned by
2288                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2289                          * trick used by container managers to prohibit host users to get access to files of
2290                          * the same UID in containers: we place everything inside a directory that has an
2291                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2292                          * for unprivileged host code. We then use fs namespacing to make this directory
2293                          * permeable for the service itself.
2294                          *
2295                          * Specifically: for a service which wants a special directory "foo/" we first create
2296                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2297                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2298                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2299                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2300                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2301                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2302                          * for the service and making sure it only gets access to the dirs it needs but no
2303                          * others. Tricky? Yes, absolutely, but it works!
2304                          *
2305                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2306                          * to be owned by the service itself.
2307                          *
2308                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2309                          * for sharing files or sockets with other services. */
2310
2311                         private_root = path_join(params->prefix[type], "private");
2312                         if (!private_root) {
2313                                 r = -ENOMEM;
2314                                 goto fail;
2315                         }
2316
2317                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2318                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2319                         if (r < 0)
2320                                 goto fail;
2321
2322                         pp = path_join(private_root, *rt);
2323                         if (!pp) {
2324                                 r = -ENOMEM;
2325                                 goto fail;
2326                         }
2327
2328                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2329                         r = mkdir_parents_label(pp, 0755);
2330                         if (r < 0)
2331                                 goto fail;
2332
2333                         if (is_dir(p, false) > 0 &&
2334                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2335
2336                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2337                                  * it over. Most likely the service has been upgraded from one that didn't use
2338                                  * DynamicUser=1, to one that does. */
2339
2340                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2341                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2342                                          exec_directory_type_to_string(type), p, pp);
2343
2344                                 if (rename(p, pp) < 0) {
2345                                         r = -errno;
2346                                         goto fail;
2347                                 }
2348                         } else {
2349                                 /* Otherwise, create the actual directory for the service */
2350
2351                                 r = mkdir_label(pp, context->directories[type].mode);
2352                                 if (r < 0 && r != -EEXIST)
2353                                         goto fail;
2354                         }
2355
2356                         /* And link it up from the original place */
2357                         r = symlink_idempotent(pp, p, true);
2358                         if (r < 0)
2359                                 goto fail;
2360
2361                 } else {
2362                         _cleanup_free_ char *target = NULL;
2363
2364                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2365                             readlink_and_make_absolute(p, &target) >= 0) {
2366                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2367
2368                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2369                                  * by DynamicUser=1 (see above)?
2370                                  *
2371                                  * We do this for all directory types except for ConfigurationDirectory=,
2372                                  * since they all support the private/ symlink logic at least in some
2373                                  * configurations, see above. */
2374
2375                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2376                                 if (r < 0)
2377                                         goto fail;
2378
2379                                 q = path_join(params->prefix[type], "private", *rt);
2380                                 if (!q) {
2381                                         r = -ENOMEM;
2382                                         goto fail;
2383                                 }
2384
2385                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2386                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2387                                 if (r < 0)
2388                                         goto fail;
2389
2390                                 if (path_equal(q_resolved, target_resolved)) {
2391
2392                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2393                                          * but is no longer. Let's move the directory back up. */
2394
2395                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2396                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2397                                                  exec_directory_type_to_string(type), q, p);
2398
2399                                         if (unlink(p) < 0) {
2400                                                 r = -errno;
2401                                                 goto fail;
2402                                         }
2403
2404                                         if (rename(q, p) < 0) {
2405                                                 r = -errno;
2406                                                 goto fail;
2407                                         }
2408                                 }
2409                         }
2410
2411                         r = mkdir_label(p, context->directories[type].mode);
2412                         if (r < 0) {
2413                                 if (r != -EEXIST)
2414                                         goto fail;
2415
2416                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2417                                         struct stat st;
2418
2419                                         /* Don't change the owner/access mode of the configuration directory,
2420                                          * as in the common case it is not written to by a service, and shall
2421                                          * not be writable. */
2422
2423                                         if (stat(p, &st) < 0) {
2424                                                 r = -errno;
2425                                                 goto fail;
2426                                         }
2427
2428                                         /* Still complain if the access mode doesn't match */
2429                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2430                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2431                                                             "(File system: %o %sMode: %o)",
2432                                                             exec_directory_type_to_string(type), *rt,
2433                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2434
2435                                         continue;
2436                                 }
2437                         }
2438                 }
2439
2440                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2441                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2442                  * current UID/GID ownership.) */
2443                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2444                 if (r < 0)
2445                         goto fail;
2446
2447                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2448                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2449                  * assignments to exist.*/
2450                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2451                 if (r < 0)
2452                         goto fail;
2453         }
2454
2455         return 0;
2456
2457 fail:
2458         *exit_status = exit_status_table[type];
2459         return r;
2460 }
2461
2462 static int write_credential(
2463                 int dfd,
2464                 const char *id,
2465                 const void *data,
2466                 size_t size,
2467                 uid_t uid,
2468                 bool ownership_ok) {
2469
2470         _cleanup_(unlink_and_freep) char *tmp = NULL;
2471         _cleanup_close_ int fd = -1;
2472         int r;
2473
2474         r = tempfn_random_child("", "cred", &tmp);
2475         if (r < 0)
2476                 return r;
2477
2478         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2479         if (fd < 0) {
2480                 tmp = mfree(tmp);
2481                 return -errno;
2482         }
2483
2484         r = loop_write(fd, data, size, /* do_pool = */ false);
2485         if (r < 0)
2486                 return r;
2487
2488         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2489                 return -errno;
2490
2491         if (uid_is_valid(uid) && uid != getuid()) {
2492                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2493                 if (r < 0) {
2494                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2495                                 return r;
2496
2497                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2498                                             * to express: that the user gets read access and nothing
2499                                             * else. But if the backing fs can't support that (e.g. ramfs)
2500                                             * then we can use file ownership instead. But that's only safe if
2501                                             * we can then re-mount the whole thing read-only, so that the
2502                                             * user can no longer chmod() the file to gain write access. */
2503                                 return r;
2504
2505                         if (fchown(fd, uid, (gid_t) -1) < 0)
2506                                 return -errno;
2507                 }
2508         }
2509
2510         if (renameat(dfd, tmp, dfd, id) < 0)
2511                 return -errno;
2512
2513         tmp = mfree(tmp);
2514         return 0;
2515 }
2516
/* Upper limit on the credential data we are willing to pass: 1 MiB (2^20 bytes). We refuse anything
 * beyond that, after all this is unswappable memory. */
#define CREDENTIALS_BYTES_MAX (1LU << 20)
2518
2519 static int acquire_credentials(
2520                 const ExecContext *context,
2521                 const ExecParameters *params,
2522                 const char *unit,
2523                 const char *p,
2524                 uid_t uid,
2525                 bool ownership_ok) {
2526
2527         uint64_t left = CREDENTIALS_BYTES_MAX;
2528         _cleanup_close_ int dfd = -1;
2529         ExecSetCredential *sc;
2530         char **id, **fn;
2531         int r;
2532
2533         assert(context);
2534         assert(p);
2535
2536         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2537         if (dfd < 0)
2538                 return -errno;
2539
2540         /* First we use the literally specified credentials. Note that they might be overridden again below,
2541          * and thus act as a "default" if the same credential is specified multiple times */
2542         HASHMAP_FOREACH(sc, context->set_credentials) {
2543                 size_t add;
2544
2545                 add = strlen(sc->id) + sc->size;
2546                 if (add > left)
2547                         return -E2BIG;
2548
2549                 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2550                 if (r < 0)
2551                         return r;
2552
2553                 left -= add;
2554         }
2555
2556         /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2557         STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2558                 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2559                 _cleanup_(erase_and_freep) char *data = NULL;
2560                 _cleanup_free_ char *j = NULL, *bindname = NULL;
2561                 const char *source;
2562                 size_t size, add;
2563
2564                 if (path_is_absolute(*fn)) {
2565                         /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2566                         source = *fn;
2567                         flags |= READ_FULL_FILE_CONNECT_SOCKET;
2568
2569                         /* Pass some minimal info about the unit and the credential name we are looking to acquire
2570                          * via the source socket address in case we read off an AF_UNIX socket. */
2571                         if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2572                                 return -ENOMEM;
2573
2574                 } else if (params->received_credentials) {
2575                         /* If this is a relative path, take it relative to the credentials we received
2576                          * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2577                          * on a credential store, i.e. this is guaranteed to be regular files. */
2578                         j = path_join(params->received_credentials, *fn);
2579                         if (!j)
2580                                 return -ENOMEM;
2581
2582                         source = j;
2583                 } else
2584                         source = NULL;
2585
2586
2587                 if (source)
2588                         r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
2589                 else
2590                         r = -ENOENT;
2591                 if (r == -ENOENT &&
2592                     faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) /* If the source file doesn't exist, but we already acquired the key otherwise, then don't fail */
2593                         continue;
2594                 if (r < 0)
2595                         return r;
2596
2597                 add = strlen(*id) + size;
2598                 if (add > left)
2599                         return -E2BIG;
2600
2601                 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2602                 if (r < 0)
2603                         return r;
2604
2605                 left -= add;
2606         }
2607
2608         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2609                 return -errno;
2610
2611         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2612          * accessible */
2613
2614         if (uid_is_valid(uid) && uid != getuid()) {
2615                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2616                 if (r < 0) {
2617                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2618                                 return r;
2619
2620                         if (!ownership_ok)
2621                                 return r;
2622
2623                         if (fchown(dfd, uid, (gid_t) -1) < 0)
2624                                 return -errno;
2625                 }
2626         }
2627
2628         return 0;
2629 }
2630
2631 static int setup_credentials_internal(
2632                 const ExecContext *context,
2633                 const ExecParameters *params,
2634                 const char *unit,
2635                 const char *final,        /* This is where the credential store shall eventually end up at */
2636                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2637                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2638                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2639                 uid_t uid) {
2640
2641         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2642                                    * if we mounted something; false if we definitely can't mount anything */
2643         bool final_mounted;
2644         const char *where;
2645
2646         assert(context);
2647         assert(final);
2648         assert(workspace);
2649
2650         if (reuse_workspace) {
2651                 r = path_is_mount_point(workspace, NULL, 0);
2652                 if (r < 0)
2653                         return r;
2654                 if (r > 0)
2655                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2656                 else
2657                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2658         } else
2659                 workspace_mounted = -1; /* ditto */
2660
2661         r = path_is_mount_point(final, NULL, 0);
2662         if (r < 0)
2663                 return r;
2664         if (r > 0) {
2665                 /* If the final place already has something mounted, we use that. If the workspace also has
2666                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2667                  * different). */
2668                 final_mounted = true;
2669
2670                 if (workspace_mounted < 0) {
2671                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2672                          * the final version to the workspace, and make it writable, so that we can make
2673                          * changes */
2674
2675                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2676                         if (r < 0)
2677                                 return r;
2678
2679                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2680                         if (r < 0)
2681                                 return r;
2682
2683                         workspace_mounted = true;
2684                 }
2685         } else
2686                 final_mounted = false;
2687
2688         if (workspace_mounted < 0) {
2689                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2690                 for (int try = 0;; try++) {
2691
2692                         if (try == 0) {
2693                                 /* Try "ramfs" first, since it's not swap backed */
2694                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2695                                 if (r >= 0) {
2696                                         workspace_mounted = true;
2697                                         break;
2698                                 }
2699
2700                         } else if (try == 1) {
2701                                 _cleanup_free_ char *opts = NULL;
2702
2703                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2704                                         return -ENOMEM;
2705
2706                                 /* Fall back to "tmpfs" otherwise */
2707                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2708                                 if (r >= 0) {
2709                                         workspace_mounted = true;
2710                                         break;
2711                                 }
2712
2713                         } else {
2714                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2715                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2716                                 if (r < 0) {
2717                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2718                                                 return r;
2719
2720                                         if (must_mount) /* If we it's not OK to use the plain directory
2721                                                          * fallback, propagate all errors too */
2722                                                 return r;
2723
2724                                         /* If we lack privileges to bind mount stuff, then let's gracefully
2725                                          * proceed for compat with container envs, and just use the final dir
2726                                          * as is. */
2727
2728                                         workspace_mounted = false;
2729                                         break;
2730                                 }
2731
2732                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2733                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2734                                 if (r < 0)
2735                                         return r;
2736
2737                                 workspace_mounted = true;
2738                                 break;
2739                         }
2740                 }
2741         }
2742
2743         assert(!must_mount || workspace_mounted > 0);
2744         where = workspace_mounted ? workspace : final;
2745
2746         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2747         if (r < 0)
2748                 return r;
2749
2750         if (workspace_mounted) {
2751                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2752                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2753                 if (r < 0)
2754                         return r;
2755
2756                 /* And mount it to the final place, read-only */
2757                 if (final_mounted)
2758                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2759                 else
2760                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2761                 if (r < 0)
2762                         return r;
2763         } else {
2764                 _cleanup_free_ char *parent = NULL;
2765
2766                 /* If we do not have our own mount put used the plain directory fallback, then we need to
2767                  * open access to the top-level credential directory and the per-service directory now */
2768
2769                 parent = dirname_malloc(final);
2770                 if (!parent)
2771                         return -ENOMEM;
2772                 if (chmod(parent, 0755) < 0)
2773                         return -errno;
2774         }
2775
2776         return 0;
2777 }
2778
2779 static int setup_credentials(
2780                 const ExecContext *context,
2781                 const ExecParameters *params,
2782                 const char *unit,
2783                 uid_t uid) {
2784
2785         _cleanup_free_ char *p = NULL, *q = NULL;
2786         const char *i;
2787         int r;
2788
2789         assert(context);
2790         assert(params);
2791
2792         if (!exec_context_has_credentials(context))
2793                 return 0;
2794
2795         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2796                 return -EINVAL;
2797
2798         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2799          * and the subdir we mount over with a read-only file system readable by the service's user */
2800         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2801         if (!q)
2802                 return -ENOMEM;
2803
2804         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2805         if (r < 0 && r != -EEXIST)
2806                 return r;
2807
2808         p = path_join(q, unit);
2809         if (!p)
2810                 return -ENOMEM;
2811
2812         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2813         if (r < 0 && r != -EEXIST)
2814                 return r;
2815
2816         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2817         if (r < 0) {
2818                 _cleanup_free_ char *t = NULL, *u = NULL;
2819
2820                 /* If this is not a privilege or support issue then propagate the error */
2821                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2822                         return r;
2823
2824                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2825                  * it into place, so that users can't access half-initialized credential stores. */
2826                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2827                 if (!t)
2828                         return -ENOMEM;
2829
2830                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2831                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2832                  * after it is fully set up */
2833                 u = path_join(t, unit);
2834                 if (!u)
2835                         return -ENOMEM;
2836
2837                 FOREACH_STRING(i, t, u) {
2838                         r = mkdir_label(i, 0700);
2839                         if (r < 0 && r != -EEXIST)
2840                                 return r;
2841                 }
2842
2843                 r = setup_credentials_internal(
2844                                 context,
2845                                 params,
2846                                 unit,
2847                                 p,       /* final mount point */
2848                                 u,       /* temporary workspace to overmount */
2849                                 true,    /* reuse the workspace if it is already a mount */
2850                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
2851                                 uid);
2852
2853                 (void) rmdir(u); /* remove the workspace again if we can. */
2854
2855                 if (r < 0)
2856                         return r;
2857
2858         } else if (r == 0) {
2859
2860                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2861                  * we can use the same directory for all cases, after turning off propagation. Question
2862                  * though is: where do we turn off propagation exactly, and where do we place the workspace
2863                  * directory? We need some place that is guaranteed to be a mount point in the host, and
2864                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2865                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
2866                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2867                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2868                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2869                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2870                  * propagation on the former, and then overmount the latter.
2871                  *
2872                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2873                  * for this purpose, but there are few other candidates that work equally well for us, and
2874                  * given that the we do this in a privately namespaced short-lived single-threaded process
2875                  * that no one else sees this should be OK to do.*/
2876
2877                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2878                 if (r < 0)
2879                         goto child_fail;
2880
2881                 r = setup_credentials_internal(
2882                                 context,
2883                                 params,
2884                                 unit,
2885                                 p,           /* final mount point */
2886                                 "/dev/shm",  /* temporary workspace to overmount */
2887                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2888                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
2889                                 uid);
2890                 if (r < 0)
2891                         goto child_fail;
2892
2893                 _exit(EXIT_SUCCESS);
2894
2895         child_fail:
2896                 _exit(EXIT_FAILURE);
2897         }
2898
2899         return 0;
2900 }
2901
2902 #if ENABLE_SMACK
2903 static int setup_smack(
2904                 const ExecContext *context,
2905                 int executable_fd) {
2906         int r;
2907
2908         assert(context);
2909         assert(executable_fd >= 0);
2910
2911         if (context->smack_process_label) {
2912                 r = mac_smack_apply_pid(0, context->smack_process_label);
2913                 if (r < 0)
2914                         return r;
2915         }
2916 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2917         else {
2918                 _cleanup_free_ char *exec_label = NULL;
2919
2920                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2921                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2922                         return r;
2923
2924                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2925                 if (r < 0)
2926                         return r;
2927         }
2928 #endif
2929
2930         return 0;
2931 }
2932 #endif
2933
2934 static int compile_bind_mounts(
2935                 const ExecContext *context,
2936                 const ExecParameters *params,
2937                 BindMount **ret_bind_mounts,
2938                 size_t *ret_n_bind_mounts,
2939                 char ***ret_empty_directories) {
2940
2941         _cleanup_strv_free_ char **empty_directories = NULL;
2942         BindMount *bind_mounts;
2943         size_t n, h = 0;
2944         int r;
2945
2946         assert(context);
2947         assert(params);
2948         assert(ret_bind_mounts);
2949         assert(ret_n_bind_mounts);
2950         assert(ret_empty_directories);
2951
2952         n = context->n_bind_mounts;
2953         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2954                 if (!params->prefix[t])
2955                         continue;
2956
2957                 n += strv_length(context->directories[t].paths);
2958         }
2959
2960         if (n <= 0) {
2961                 *ret_bind_mounts = NULL;
2962                 *ret_n_bind_mounts = 0;
2963                 *ret_empty_directories = NULL;
2964                 return 0;
2965         }
2966
2967         bind_mounts = new(BindMount, n);
2968         if (!bind_mounts)
2969                 return -ENOMEM;
2970
2971         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2972                 BindMount *item = context->bind_mounts + i;
2973                 char *s, *d;
2974
2975                 s = strdup(item->source);
2976                 if (!s) {
2977                         r = -ENOMEM;
2978                         goto finish;
2979                 }
2980
2981                 d = strdup(item->destination);
2982                 if (!d) {
2983                         free(s);
2984                         r = -ENOMEM;
2985                         goto finish;
2986                 }
2987
2988                 bind_mounts[h++] = (BindMount) {
2989                         .source = s,
2990                         .destination = d,
2991                         .read_only = item->read_only,
2992                         .recursive = item->recursive,
2993                         .ignore_enoent = item->ignore_enoent,
2994                 };
2995         }
2996
2997         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2998                 char **suffix;
2999
3000                 if (!params->prefix[t])
3001                         continue;
3002
3003                 if (strv_isempty(context->directories[t].paths))
3004                         continue;
3005
3006                 if (exec_directory_is_private(context, t) &&
3007                     !exec_context_with_rootfs(context)) {
3008                         char *private_root;
3009
3010                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3011                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3012                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3013
3014                         private_root = path_join(params->prefix[t], "private");
3015                         if (!private_root) {
3016                                 r = -ENOMEM;
3017                                 goto finish;
3018                         }
3019
3020                         r = strv_consume(&empty_directories, private_root);
3021                         if (r < 0)
3022                                 goto finish;
3023                 }
3024
3025                 STRV_FOREACH(suffix, context->directories[t].paths) {
3026                         char *s, *d;
3027
3028                         if (exec_directory_is_private(context, t))
3029                                 s = path_join(params->prefix[t], "private", *suffix);
3030                         else
3031                                 s = path_join(params->prefix[t], *suffix);
3032                         if (!s) {
3033                                 r = -ENOMEM;
3034                                 goto finish;
3035                         }
3036
3037                         if (exec_directory_is_private(context, t) &&
3038                             exec_context_with_rootfs(context))
3039                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3040                                  * directory is not created on the root directory. So, let's bind-mount the directory
3041                                  * on the 'non-private' place. */
3042                                 d = path_join(params->prefix[t], *suffix);
3043                         else
3044                                 d = strdup(s);
3045                         if (!d) {
3046                                 free(s);
3047                                 r = -ENOMEM;
3048                                 goto finish;
3049                         }
3050
3051                         bind_mounts[h++] = (BindMount) {
3052                                 .source = s,
3053                                 .destination = d,
3054                                 .read_only = false,
3055                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3056                                 .recursive = true,
3057                                 .ignore_enoent = false,
3058                         };
3059                 }
3060         }
3061
3062         assert(h == n);
3063
3064         *ret_bind_mounts = bind_mounts;
3065         *ret_n_bind_mounts = n;
3066         *ret_empty_directories = TAKE_PTR(empty_directories);
3067
3068         return (int) n;
3069
3070 finish:
3071         bind_mount_free_many(bind_mounts, h);
3072         return r;
3073 }
3074
3075 static bool insist_on_sandboxing(
3076                 const ExecContext *context,
3077                 const char *root_dir,
3078                 const char *root_image,
3079                 const BindMount *bind_mounts,
3080                 size_t n_bind_mounts) {
3081
3082         assert(context);
3083         assert(n_bind_mounts == 0 || bind_mounts);
3084
3085         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3086          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3087          * rearrange stuff in a way we cannot ignore gracefully. */
3088
3089         if (context->n_temporary_filesystems > 0)
3090                 return true;
3091
3092         if (root_dir || root_image)
3093                 return true;
3094
3095         if (context->n_mount_images > 0)
3096                 return true;
3097
3098         if (context->dynamic_user)
3099                 return true;
3100
3101         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3102          * essential. */
3103         for (size_t i = 0; i < n_bind_mounts; i++)
3104                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3105                         return true;
3106
3107         if (context->log_namespace)
3108                 return true;
3109
3110         return false;
3111 }
3112
3113 static int apply_mount_namespace(
3114                 const Unit *u,
3115                 ExecCommandFlags command_flags,
3116                 const ExecContext *context,
3117                 const ExecParameters *params,
3118                 const ExecRuntime *runtime,
3119                 char **error_path) {
3120
3121         _cleanup_strv_free_ char **empty_directories = NULL;
3122         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3123         const char *root_dir = NULL, *root_image = NULL;
3124         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3125         NamespaceInfo ns_info;
3126         bool needs_sandboxing;
3127         BindMount *bind_mounts = NULL;
3128         size_t n_bind_mounts = 0;
3129         int r;
3130
3131         assert(context);
3132
3133         if (params->flags & EXEC_APPLY_CHROOT) {
3134                 root_image = context->root_image;
3135
3136                 if (!root_image)
3137                         root_dir = context->root_directory;
3138         }
3139
3140         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3141         if (r < 0)
3142                 return r;
3143
3144         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3145         if (needs_sandboxing) {
3146                 /* The runtime struct only contains the parent of the private /tmp,
3147                  * which is non-accessible to world users. Inside of it there's a /tmp
3148                  * that is sticky, and that's the one we want to use here.
3149                  * This does not apply when we are using /run/systemd/empty as fallback. */
3150
3151                 if (context->private_tmp && runtime) {
3152                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3153                                 tmp_dir = runtime->tmp_dir;
3154                         else if (runtime->tmp_dir)
3155                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3156
3157                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3158                                 var_tmp_dir = runtime->var_tmp_dir;
3159                         else if (runtime->var_tmp_dir)
3160                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3161                 }
3162
3163                 ns_info = (NamespaceInfo) {
3164                         .ignore_protect_paths = false,
3165                         .private_dev = context->private_devices,
3166                         .protect_control_groups = context->protect_control_groups,
3167                         .protect_kernel_tunables = context->protect_kernel_tunables,
3168                         .protect_kernel_modules = context->protect_kernel_modules,
3169                         .protect_kernel_logs = context->protect_kernel_logs,
3170                         .protect_hostname = context->protect_hostname,
3171                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3172                         .private_mounts = context->private_mounts,
3173                         .protect_home = context->protect_home,
3174                         .protect_system = context->protect_system,
3175                         .protect_proc = context->protect_proc,
3176                         .proc_subset = context->proc_subset,
3177                 };
3178         } else if (!context->dynamic_user && root_dir)
3179                 /*
3180                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3181                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3182                  * fail if we are enable to apply the sandbox inside the mount namespace.
3183                  */
3184                 ns_info = (NamespaceInfo) {
3185                         .ignore_protect_paths = true,
3186                 };
3187         else
3188                 ns_info = (NamespaceInfo) {};
3189
3190         if (context->mount_flags == MS_SHARED)
3191                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3192
3193         if (exec_context_has_credentials(context) &&
3194             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3195             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3196                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3197                 if (!creds_path) {
3198                         r = -ENOMEM;
3199                         goto finalize;
3200                 }
3201         }
3202
3203         if (MANAGER_IS_SYSTEM(u->manager)) {
3204                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3205                 if (!propagate_dir)
3206                         return -ENOMEM;
3207                 incoming_dir = strdup("/run/systemd/incoming");
3208                 if (!incoming_dir)
3209                         return -ENOMEM;
3210         }
3211
3212         r = setup_namespace(root_dir, root_image, context->root_image_options,
3213                             &ns_info, context->read_write_paths,
3214                             needs_sandboxing ? context->read_only_paths : NULL,
3215                             needs_sandboxing ? context->inaccessible_paths : NULL,
3216                             needs_sandboxing ? context->exec_paths : NULL,
3217                             needs_sandboxing ? context->no_exec_paths : NULL,
3218                             empty_directories,
3219                             bind_mounts,
3220                             n_bind_mounts,
3221                             context->temporary_filesystems,
3222                             context->n_temporary_filesystems,
3223                             context->mount_images,
3224                             context->n_mount_images,
3225                             tmp_dir,
3226                             var_tmp_dir,
3227                             creds_path,
3228                             context->log_namespace,
3229                             context->mount_flags,
3230                             context->root_hash, context->root_hash_size, context->root_hash_path,
3231                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3232                             context->root_verity,
3233                             propagate_dir,
3234                             incoming_dir,
3235                             root_dir || root_image ? params->notify_socket : NULL,
3236                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
3237                             error_path);
3238
3239         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3240          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3241          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3242          * completely different execution environment. */
3243         if (r == -ENOANO) {
3244                 if (insist_on_sandboxing(
3245                                     context,
3246                                     root_dir, root_image,
3247                                     bind_mounts,
3248                                     n_bind_mounts)) {
3249                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3250                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3251                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3252
3253                         r = -EOPNOTSUPP;
3254                 } else {
3255                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3256                         r = 0;
3257                 }
3258         }
3259
3260 finalize:
3261         bind_mount_free_many(bind_mounts, n_bind_mounts);
3262         return r;
3263 }
3264
3265 static int apply_working_directory(
3266                 const ExecContext *context,
3267                 const ExecParameters *params,
3268                 const char *home,
3269                 int *exit_status) {
3270
3271         const char *d, *wd;
3272
3273         assert(context);
3274         assert(exit_status);
3275
3276         if (context->working_directory_home) {
3277
3278                 if (!home) {
3279                         *exit_status = EXIT_CHDIR;
3280                         return -ENXIO;
3281                 }
3282
3283                 wd = home;
3284
3285         } else
3286                 wd = empty_to_root(context->working_directory);
3287
3288         if (params->flags & EXEC_APPLY_CHROOT)
3289                 d = wd;
3290         else
3291                 d = prefix_roota(context->root_directory, wd);
3292
3293         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3294                 *exit_status = EXIT_CHDIR;
3295                 return -errno;
3296         }
3297
3298         return 0;
3299 }
3300
3301 static int apply_root_directory(
3302                 const ExecContext *context,
3303                 const ExecParameters *params,
3304                 const bool needs_mount_ns,
3305                 int *exit_status) {
3306
3307         assert(context);
3308         assert(exit_status);
3309
3310         if (params->flags & EXEC_APPLY_CHROOT)
3311                 if (!needs_mount_ns && context->root_directory)
3312                         if (chroot(context->root_directory) < 0) {
3313                                 *exit_status = EXIT_CHROOT;
3314                                 return -errno;
3315                         }
3316
3317         return 0;
3318 }
3319
3320 static int setup_keyring(
3321                 const Unit *u,
3322                 const ExecContext *context,
3323                 const ExecParameters *p,
3324                 uid_t uid, gid_t gid) {
3325
3326         key_serial_t keyring;
3327         int r = 0;
3328         uid_t saved_uid;
3329         gid_t saved_gid;
3330
3331         assert(u);
3332         assert(context);
3333         assert(p);
3334
3335         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3336          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3337          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3338          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3339          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3340          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3341
3342         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3343                 return 0;
3344
3345         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3346          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3347          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3348          * & group is just as nasty as acquiring a reference to the user keyring. */
3349
3350         saved_uid = getuid();
3351         saved_gid = getgid();
3352
3353         if (gid_is_valid(gid) && gid != saved_gid) {
3354                 if (setregid(gid, -1) < 0)
3355                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3356         }
3357
3358         if (uid_is_valid(uid) && uid != saved_uid) {
3359                 if (setreuid(uid, -1) < 0) {
3360                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3361                         goto out;
3362                 }
3363         }
3364
3365         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3366         if (keyring == -1) {
3367                 if (errno == ENOSYS)
3368                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3369                 else if (ERRNO_IS_PRIVILEGE(errno))
3370                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3371                 else if (errno == EDQUOT)
3372                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3373                 else
3374                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3375
3376                 goto out;
3377         }
3378
3379         /* When requested link the user keyring into the session keyring. */
3380         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3381
3382                 if (keyctl(KEYCTL_LINK,
3383                            KEY_SPEC_USER_KEYRING,
3384                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3385                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3386                         goto out;
3387                 }
3388         }
3389
3390         /* Restore uid/gid back */
3391         if (uid_is_valid(uid) && uid != saved_uid) {
3392                 if (setreuid(saved_uid, -1) < 0) {
3393                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3394                         goto out;
3395                 }
3396         }
3397
3398         if (gid_is_valid(gid) && gid != saved_gid) {
3399                 if (setregid(saved_gid, -1) < 0)
3400                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3401         }
3402
3403         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3404         if (!sd_id128_is_null(u->invocation_id)) {
3405                 key_serial_t key;
3406
3407                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3408                 if (key == -1)
3409                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3410                 else {
3411                         if (keyctl(KEYCTL_SETPERM, key,
3412                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3413                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3414                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3415                 }
3416         }
3417
3418 out:
3419         /* Revert back uid & gid for the last time, and exit */
3420         /* no extra logging, as only the first already reported error matters */
3421         if (getuid() != saved_uid)
3422                 (void) setreuid(saved_uid, -1);
3423
3424         if (getgid() != saved_gid)
3425                 (void) setregid(saved_gid, -1);
3426
3427         return r;
3428 }
3429
/* Appends each non-negative fd of the two-element 'pair' to 'array', starting at index *n and advancing
 * *n for every fd stored. Negative entries are skipped. The caller must make sure 'array' has room for up
 * to two more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}
3440
3441 static int close_remaining_fds(
3442                 const ExecParameters *params,
3443                 const ExecRuntime *runtime,
3444                 const DynamicCreds *dcreds,
3445                 int user_lookup_fd,
3446                 int socket_fd,
3447                 const int *fds, size_t n_fds) {
3448
3449         size_t n_dont_close = 0;
3450         int dont_close[n_fds + 12];
3451
3452         assert(params);
3453
3454         if (params->stdin_fd >= 0)
3455                 dont_close[n_dont_close++] = params->stdin_fd;
3456         if (params->stdout_fd >= 0)
3457                 dont_close[n_dont_close++] = params->stdout_fd;
3458         if (params->stderr_fd >= 0)
3459                 dont_close[n_dont_close++] = params->stderr_fd;
3460
3461         if (socket_fd >= 0)
3462                 dont_close[n_dont_close++] = socket_fd;
3463         if (n_fds > 0) {
3464                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3465                 n_dont_close += n_fds;
3466         }
3467
3468         if (runtime)
3469                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3470
3471         if (dcreds) {
3472                 if (dcreds->user)
3473                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3474                 if (dcreds->group)
3475                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3476         }
3477
3478         if (user_lookup_fd >= 0)
3479                 dont_close[n_dont_close++] = user_lookup_fd;
3480
3481         return close_all_fds(dont_close, n_dont_close);
3482 }
3483
3484 static int send_user_lookup(
3485                 Unit *unit,
3486                 int user_lookup_fd,
3487                 uid_t uid,
3488                 gid_t gid) {
3489
3490         assert(unit);
3491
3492         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3493          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3494          * specified. */
3495
3496         if (user_lookup_fd < 0)
3497                 return 0;
3498
3499         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3500                 return 0;
3501
3502         if (writev(user_lookup_fd,
3503                (struct iovec[]) {
3504                            IOVEC_INIT(&uid, sizeof(uid)),
3505                            IOVEC_INIT(&gid, sizeof(gid)),
3506                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3507                 return -errno;
3508
3509         return 0;
3510 }
3511
3512 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3513         int r;
3514
3515         assert(c);
3516         assert(home);
3517         assert(buf);
3518
3519         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3520
3521         if (*home)
3522                 return 0;
3523
3524         if (!c->working_directory_home)
3525                 return 0;
3526
3527         r = get_home_dir(buf);
3528         if (r < 0)
3529                 return r;
3530
3531         *home = *buf;
3532         return 1;
3533 }
3534
3535 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3536         _cleanup_strv_free_ char ** list = NULL;
3537         int r;
3538
3539         assert(c);
3540         assert(p);
3541         assert(ret);
3542
3543         assert(c->dynamic_user);
3544
3545         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3546          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3547          * directories. */
3548
3549         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3550                 char **i;
3551
3552                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3553                         continue;
3554
3555                 if (!p->prefix[t])
3556                         continue;
3557
3558                 STRV_FOREACH(i, c->directories[t].paths) {
3559                         char *e;
3560
3561                         if (exec_directory_is_private(c, t))
3562                                 e = path_join(p->prefix[t], "private", *i);
3563                         else
3564                                 e = path_join(p->prefix[t], *i);
3565                         if (!e)
3566                                 return -ENOMEM;
3567
3568                         r = strv_consume(&list, e);
3569                         if (r < 0)
3570                                 return r;
3571                 }
3572         }
3573
3574         *ret = TAKE_PTR(list);
3575
3576         return 0;
3577 }
3578
3579 static char *exec_command_line(char **argv);
3580
3581 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3582         bool using_subcgroup;
3583         char *p;
3584
3585         assert(params);
3586         assert(ret);
3587
3588         if (!params->cgroup_path)
3589                 return -EINVAL;
3590
3591         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3592          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3593          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3594          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3595          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3596          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3597          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3598          * flag, which is only passed for the former statements, not for the latter. */
3599
3600         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3601         if (using_subcgroup)
3602                 p = path_join(params->cgroup_path, ".control");
3603         else
3604                 p = strdup(params->cgroup_path);
3605         if (!p)
3606                 return -ENOMEM;
3607
3608         *ret = p;
3609         return using_subcgroup;
3610 }
3611
3612 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3613         _cleanup_(cpu_set_reset) CPUSet s = {};
3614         int r;
3615
3616         assert(c);
3617         assert(ret);
3618
3619         if (!c->numa_policy.nodes.set) {
3620                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3621                 return 0;
3622         }
3623
3624         r = numa_to_cpu_set(&c->numa_policy, &s);
3625         if (r < 0)
3626                 return r;
3627
3628         cpu_set_reset(ret);
3629
3630         return cpu_set_add_all(ret, &s);
3631 }
3632
3633 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3634         assert(c);
3635
3636         return c->cpu_affinity_from_numa;
3637 }
3638
3639 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3640         int r;
3641
3642         assert(fds);
3643         assert(n_fds);
3644         assert(*n_fds < fds_size);
3645         assert(ret_fd);
3646
3647         if (fd < 0) {
3648                 *ret_fd = -1;
3649                 return 0;
3650         }
3651
3652         if (fd < 3 + (int) *n_fds) {
3653                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3654                  * the fds we pass to the process (or which are closed only during execve). */
3655
3656                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3657                 if (r < 0)
3658                         return -errno;
3659
3660                 CLOSE_AND_REPLACE(fd, r);
3661         }
3662
3663         *ret_fd = fds[*n_fds] = fd;
3664         (*n_fds) ++;
3665         return 1;
3666 }
3667
3668 static int exec_child(
3669                 Unit *unit,
3670                 const ExecCommand *command,
3671                 const ExecContext *context,
3672                 const ExecParameters *params,
3673                 ExecRuntime *runtime,
3674                 DynamicCreds *dcreds,
3675                 int socket_fd,
3676                 const int named_iofds[static 3],
3677                 int *fds,
3678                 size_t n_socket_fds,
3679                 size_t n_storage_fds,
3680                 char **files_env,
3681                 int user_lookup_fd,
3682                 int *exit_status) {
3683
3684         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3685         int r, ngids = 0, exec_fd;
3686         _cleanup_free_ gid_t *supplementary_gids = NULL;
3687         const char *username = NULL, *groupname = NULL;
3688         _cleanup_free_ char *home_buffer = NULL;
3689         const char *home = NULL, *shell = NULL;
3690         char **final_argv = NULL;
3691         dev_t journal_stream_dev = 0;
3692         ino_t journal_stream_ino = 0;
3693         bool userns_set_up = false;
3694         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3695                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3696                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3697                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3698 #if HAVE_SELINUX
3699         _cleanup_free_ char *mac_selinux_context_net = NULL;
3700         bool use_selinux = false;
3701 #endif
3702 #if ENABLE_SMACK
3703         bool use_smack = false;
3704 #endif
3705 #if HAVE_APPARMOR
3706         bool use_apparmor = false;
3707 #endif
3708         uid_t saved_uid = getuid();
3709         gid_t saved_gid = getgid();
3710         uid_t uid = UID_INVALID;
3711         gid_t gid = GID_INVALID;
3712         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3713                n_keep_fds; /* total number of fds not to close */
3714         int secure_bits;
3715         _cleanup_free_ gid_t *gids_after_pam = NULL;
3716         int ngids_after_pam = 0;
3717
3718         assert(unit);
3719         assert(command);
3720         assert(context);
3721         assert(params);
3722         assert(exit_status);
3723
3724         rename_process_from_path(command->path);
3725
3726         /* We reset exactly these signals, since they are the
3727          * only ones we set to SIG_IGN in the main daemon. All
3728          * others we leave untouched because we set them to
3729          * SIG_DFL or a valid handler initially, both of which
3730          * will be demoted to SIG_DFL. */
3731         (void) default_signals(SIGNALS_CRASH_HANDLER,
3732                                SIGNALS_IGNORE, -1);
3733
3734         if (context->ignore_sigpipe)
3735                 (void) ignore_signals(SIGPIPE, -1);
3736
3737         r = reset_signal_mask();
3738         if (r < 0) {
3739                 *exit_status = EXIT_SIGNAL_MASK;
3740                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3741         }
3742
3743         if (params->idle_pipe)
3744                 do_idle_pipe_dance(params->idle_pipe);
3745
3746         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3747          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3748          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3749          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3750
3751         log_forget_fds();
3752         log_set_open_when_needed(true);
3753
3754         /* In case anything used libc syslog(), close this here, too */
3755         closelog();
3756
3757         int keep_fds[n_fds + 2];
3758         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3759         n_keep_fds = n_fds;
3760
3761         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3762         if (r < 0) {
3763                 *exit_status = EXIT_FDS;
3764                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3765         }
3766
3767         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3768         if (r < 0) {
3769                 *exit_status = EXIT_FDS;
3770                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3771         }
3772
3773         if (!context->same_pgrp &&
3774             setsid() < 0) {
3775                 *exit_status = EXIT_SETSID;
3776                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3777         }
3778
3779         exec_context_tty_reset(context, params);
3780
3781         if (unit_shall_confirm_spawn(unit)) {
3782                 const char *vc = params->confirm_spawn;
3783                 _cleanup_free_ char *cmdline = NULL;
3784
3785                 cmdline = exec_command_line(command->argv);
3786                 if (!cmdline) {
3787                         *exit_status = EXIT_MEMORY;
3788                         return log_oom();
3789                 }
3790
3791                 r = ask_for_confirmation(vc, unit, cmdline);
3792                 if (r != CONFIRM_EXECUTE) {
3793                         if (r == CONFIRM_PRETEND_SUCCESS) {
3794                                 *exit_status = EXIT_SUCCESS;
3795                                 return 0;
3796                         }
3797                         *exit_status = EXIT_CONFIRM;
3798                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3799                                                     "Execution cancelled by the user");
3800                 }
3801         }
3802
3803         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3804          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3805          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3806          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3807          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3808         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3809             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3810                 *exit_status = EXIT_MEMORY;
3811                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3812         }
3813
3814         if (context->dynamic_user && dcreds) {
3815                 _cleanup_strv_free_ char **suggested_paths = NULL;
3816
3817                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3818                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3819                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3820                         *exit_status = EXIT_USER;
3821                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3822                 }
3823
3824                 r = compile_suggested_paths(context, params, &suggested_paths);
3825                 if (r < 0) {
3826                         *exit_status = EXIT_MEMORY;
3827                         return log_oom();
3828                 }
3829
3830                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3831                 if (r < 0) {
3832                         *exit_status = EXIT_USER;
3833                         if (r == -EILSEQ)
3834                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3835                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
3836                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3837                 }
3838
3839                 if (!uid_is_valid(uid)) {
3840                         *exit_status = EXIT_USER;
3841                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
3842                 }
3843
3844                 if (!gid_is_valid(gid)) {
3845                         *exit_status = EXIT_USER;
3846                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
3847                 }
3848
3849                 if (dcreds->user)
3850                         username = dcreds->user->name;
3851
3852         } else {
3853                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3854                 if (r < 0) {
3855                         *exit_status = EXIT_USER;
3856                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3857                 }
3858
3859                 r = get_fixed_group(context, &groupname, &gid);
3860                 if (r < 0) {
3861                         *exit_status = EXIT_GROUP;
3862                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3863                 }
3864         }
3865
3866         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3867         r = get_supplementary_groups(context, username, groupname, gid,
3868                                      &supplementary_gids, &ngids);
3869         if (r < 0) {
3870                 *exit_status = EXIT_GROUP;
3871                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3872         }
3873
3874         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3875         if (r < 0) {
3876                 *exit_status = EXIT_USER;
3877                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3878         }
3879
3880         user_lookup_fd = safe_close(user_lookup_fd);
3881
3882         r = acquire_home(context, uid, &home, &home_buffer);
3883         if (r < 0) {
3884                 *exit_status = EXIT_CHDIR;
3885                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3886         }
3887
3888         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3889          * must make sure to drop O_NONBLOCK */
3890         if (socket_fd >= 0)
3891                 (void) fd_nonblock(socket_fd, false);
3892
3893         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3894          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3895         if (params->cgroup_path) {
3896                 _cleanup_free_ char *p = NULL;
3897
3898                 r = exec_parameters_get_cgroup_path(params, &p);
3899                 if (r < 0) {
3900                         *exit_status = EXIT_CGROUP;
3901                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3902                 }
3903
3904                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3905                 if (r < 0) {
3906                         *exit_status = EXIT_CGROUP;
3907                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3908                 }
3909         }
3910
3911         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3912                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3913                 if (r < 0) {
3914                         *exit_status = EXIT_NETWORK;
3915                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3916                 }
3917         }
3918
3919         r = setup_input(context, params, socket_fd, named_iofds);
3920         if (r < 0) {
3921                 *exit_status = EXIT_STDIN;
3922                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3923         }
3924
3925         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3926         if (r < 0) {
3927                 *exit_status = EXIT_STDOUT;
3928                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3929         }
3930
3931         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3932         if (r < 0) {
3933                 *exit_status = EXIT_STDERR;
3934                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3935         }
3936
3937         if (context->oom_score_adjust_set) {
3938                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3939                  * prohibit write access to this file, and we shouldn't trip up over that. */
3940                 r = set_oom_score_adjust(context->oom_score_adjust);
3941                 if (ERRNO_IS_PRIVILEGE(r))
3942                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3943                 else if (r < 0) {
3944                         *exit_status = EXIT_OOM_ADJUST;
3945                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3946                 }
3947         }
3948
3949         if (context->coredump_filter_set) {
3950                 r = set_coredump_filter(context->coredump_filter);
3951                 if (ERRNO_IS_PRIVILEGE(r))
3952                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3953                 else if (r < 0)
3954                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3955         }
3956
3957         if (context->nice_set) {
3958                 r = setpriority_closest(context->nice);
3959                 if (r < 0)
3960                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3961         }
3962
3963         if (context->cpu_sched_set) {
3964                 struct sched_param param = {
3965                         .sched_priority = context->cpu_sched_priority,
3966                 };
3967
3968                 r = sched_setscheduler(0,
3969                                        context->cpu_sched_policy |
3970                                        (context->cpu_sched_reset_on_fork ?
3971                                         SCHED_RESET_ON_FORK : 0),
3972                                        &param);
3973                 if (r < 0) {
3974                         *exit_status = EXIT_SETSCHEDULER;
3975                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3976                 }
3977         }
3978
3979         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3980                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3981                 const CPUSet *cpu_set;
3982
3983                 if (context->cpu_affinity_from_numa) {
3984                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3985                         if (r < 0) {
3986                                 *exit_status = EXIT_CPUAFFINITY;
3987                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3988                         }
3989
3990                         cpu_set = &converted_cpu_set;
3991                 } else
3992                         cpu_set = &context->cpu_set;
3993
3994                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3995                         *exit_status = EXIT_CPUAFFINITY;
3996                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3997                 }
3998         }
3999
4000         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4001                 r = apply_numa_policy(&context->numa_policy);
4002                 if (r == -EOPNOTSUPP)
4003                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4004                 else if (r < 0) {
4005                         *exit_status = EXIT_NUMA_POLICY;
4006                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4007                 }
4008         }
4009
4010         if (context->ioprio_set)
4011                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4012                         *exit_status = EXIT_IOPRIO;
4013                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4014                 }
4015
4016         if (context->timer_slack_nsec != NSEC_INFINITY)
4017                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4018                         *exit_status = EXIT_TIMERSLACK;
4019                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4020                 }
4021
4022         if (context->personality != PERSONALITY_INVALID) {
4023                 r = safe_personality(context->personality);
4024                 if (r < 0) {
4025                         *exit_status = EXIT_PERSONALITY;
4026                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4027                 }
4028         }
4029
4030         if (context->utmp_id)
4031                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4032                                       context->tty_path,
4033                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4034                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4035                                       USER_PROCESS,
4036                                       username);
4037
4038         if (uid_is_valid(uid)) {
4039                 r = chown_terminal(STDIN_FILENO, uid);
4040                 if (r < 0) {
4041                         *exit_status = EXIT_STDIN;
4042                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4043                 }
4044         }
4045
4046         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4047          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4048          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4049          * touch a single hierarchy too. */
4050         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4051                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4052                 if (r < 0) {
4053                         *exit_status = EXIT_CGROUP;
4054                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4055                 }
4056         }
4057
4058         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4059                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
4060                 if (r < 0)
4061                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4062         }
4063
4064         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4065                 r = setup_credentials(context, params, unit->id, uid);
4066                 if (r < 0) {
4067                         *exit_status = EXIT_CREDENTIALS;
4068                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4069                 }
4070         }
4071
4072         r = build_environment(
4073                         unit,
4074                         context,
4075                         params,
4076                         n_fds,
4077                         home,
4078                         username,
4079                         shell,
4080                         journal_stream_dev,
4081                         journal_stream_ino,
4082                         &our_env);
4083         if (r < 0) {
4084                 *exit_status = EXIT_MEMORY;
4085                 return log_oom();
4086         }
4087
4088         r = build_pass_environment(context, &pass_env);
4089         if (r < 0) {
4090                 *exit_status = EXIT_MEMORY;
4091                 return log_oom();
4092         }
4093
4094         accum_env = strv_env_merge(5,
4095                                    params->environment,
4096                                    our_env,
4097                                    pass_env,
4098                                    context->environment,
4099                                    files_env);
4100         if (!accum_env) {
4101                 *exit_status = EXIT_MEMORY;
4102                 return log_oom();
4103         }
4104         accum_env = strv_env_clean(accum_env);
4105
4106         (void) umask(context->umask);
4107
4108         r = setup_keyring(unit, context, params, uid, gid);
4109         if (r < 0) {
4110                 *exit_status = EXIT_KEYRING;
4111                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4112         }
4113
4114         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4115         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4116
4117         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4118         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4119
4120         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4121         if (needs_ambient_hack)
4122                 needs_setuid = false;
4123         else
4124                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4125
4126         if (needs_sandboxing) {
4127                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4128                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4129                  * impacting our own code paths. */
4130
4131 #if HAVE_SELINUX
4132                 use_selinux = mac_selinux_use();
4133 #endif
4134 #if ENABLE_SMACK
4135                 use_smack = mac_smack_use();
4136 #endif
4137 #if HAVE_APPARMOR
4138                 use_apparmor = mac_apparmor_use();
4139 #endif
4140         }
4141
4142         if (needs_sandboxing) {
4143                 int which_failed;
4144
4145                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4146                  * is set here. (See below.) */
4147
4148                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4149                 if (r < 0) {
4150                         *exit_status = EXIT_LIMITS;
4151                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4152                 }
4153         }
4154
4155         if (needs_setuid && context->pam_name && username) {
4156                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4157                  * wins here. (See above.) */
4158
4159                 /* All fds passed in the fds array will be closed in the pam child process. */
4160                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4161                 if (r < 0) {
4162                         *exit_status = EXIT_PAM;
4163                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4164                 }
4165
4166                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4167                 if (ngids_after_pam < 0) {
4168                         *exit_status = EXIT_MEMORY;
4169                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4170                 }
4171         }
4172
4173         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4174                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4175                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4176                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4177
4178                 userns_set_up = true;
4179                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4180                 if (r < 0) {
4181                         *exit_status = EXIT_USER;
4182                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4183                 }
4184         }
4185
4186         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4187
4188                 if (ns_type_supported(NAMESPACE_NET)) {
4189                         r = setup_netns(runtime->netns_storage_socket);
4190                         if (r == -EPERM)
4191                                 log_unit_warning_errno(unit, r,
4192                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4193                         else if (r < 0) {
4194                                 *exit_status = EXIT_NETWORK;
4195                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4196                         }
4197                 } else if (context->network_namespace_path) {
4198                         *exit_status = EXIT_NETWORK;
4199                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4200                                                     "NetworkNamespacePath= is not supported, refusing.");
4201                 } else
4202                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4203         }
4204
4205         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4206         if (needs_mount_namespace) {
4207                 _cleanup_free_ char *error_path = NULL;
4208
4209                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4210                 if (r < 0) {
4211                         *exit_status = EXIT_NAMESPACE;
4212                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4213                                                     error_path ? ": " : "", strempty(error_path));
4214                 }
4215         }
4216
4217         if (needs_sandboxing) {
4218                 r = apply_protect_hostname(unit, context, exit_status);
4219                 if (r < 0)
4220                         return r;
4221         }
4222
4223         /* Drop groups as early as possible.
4224          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4225          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4226         if (needs_setuid) {
4227                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4228                 int ngids_to_enforce = 0;
4229
4230                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4231                                                    ngids,
4232                                                    gids_after_pam,
4233                                                    ngids_after_pam,
4234                                                    &gids_to_enforce);
4235                 if (ngids_to_enforce < 0) {
4236                         *exit_status = EXIT_MEMORY;
4237                         return log_unit_error_errno(unit,
4238                                                     ngids_to_enforce,
4239                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4240                 }
4241
4242                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4243                 if (r < 0) {
4244                         *exit_status = EXIT_GROUP;
4245                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4246                 }
4247         }
4248
4249         /* If the user namespace was not set up above, try to do it now.
4250          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4251          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4252          * case of mount namespaces being less privileged when the mount point list is copied from a
4253          * different user namespace). */
4254
4255         if (needs_sandboxing && context->private_users && !userns_set_up) {
4256                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4257                 if (r < 0) {
4258                         *exit_status = EXIT_USER;
4259                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4260                 }
4261         }
4262
4263         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4264          * shall execute. */
4265
4266         _cleanup_free_ char *executable = NULL;
4267         _cleanup_close_ int executable_fd = -1;
4268         r = find_executable_full(command->path, false, &executable, &executable_fd);
4269         if (r < 0) {
4270                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4271                         log_struct_errno(LOG_INFO, r,
4272                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4273                                          LOG_UNIT_ID(unit),
4274                                          LOG_UNIT_INVOCATION_ID(unit),
4275                                          LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4276                                                           command->path),
4277                                          "EXECUTABLE=%s", command->path);
4278                         return 0;
4279                 }
4280
4281                 *exit_status = EXIT_EXEC;
4282                 return log_struct_errno(LOG_INFO, r,
4283                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4284                                         LOG_UNIT_ID(unit),
4285                                         LOG_UNIT_INVOCATION_ID(unit),
4286                                         LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4287                                                          command->path),
4288                                         "EXECUTABLE=%s", command->path);
4289         }
4290
4291         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4292         if (r < 0) {
4293                 *exit_status = EXIT_FDS;
4294                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4295         }
4296
4297 #if HAVE_SELINUX
4298         if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4299                 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4300                 if (r < 0) {
4301                         *exit_status = EXIT_SELINUX_CONTEXT;
4302                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4303                 }
4304         }
4305 #endif
4306
4307         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4308          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
4309          * however if we have it as we want to keep it open until the final execve(). */
4310
4311         r = close_all_fds(keep_fds, n_keep_fds);
4312         if (r >= 0)
4313                 r = shift_fds(fds, n_fds);
4314         if (r >= 0)
4315                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4316         if (r < 0) {
4317                 *exit_status = EXIT_FDS;
4318                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4319         }
4320
4321         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4322          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4323          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4324          * came this far. */
4325
4326         secure_bits = context->secure_bits;
4327
4328         if (needs_sandboxing) {
4329                 uint64_t bset;
4330
4331                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4332                  * requested. (Note this is placed after the general resource limit initialization, see
4333                  * above, in order to take precedence.) */
4334                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4335                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4336                                 *exit_status = EXIT_LIMITS;
4337                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4338                         }
4339                 }
4340
4341 #if ENABLE_SMACK
4342                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4343                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4344                 if (use_smack) {
4345                         r = setup_smack(context, executable_fd);
4346                         if (r < 0) {
4347                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4348                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4349                         }
4350                 }
4351 #endif
4352
4353                 bset = context->capability_bounding_set;
4354                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4355                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4356                  * instead of us doing that */
4357                 if (needs_ambient_hack)
4358                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4359                                 (UINT64_C(1) << CAP_SETUID) |
4360                                 (UINT64_C(1) << CAP_SETGID);
4361
4362                 if (!cap_test_all(bset)) {
4363                         r = capability_bounding_set_drop(bset, false);
4364                         if (r < 0) {
4365                                 *exit_status = EXIT_CAPABILITIES;
4366                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4367                         }
4368                 }
4369
4370                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4371                  * keep-caps set.
4372                  * To be able to raise the ambient capabilities after setresuid() they have to be
4373                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4374                  * After setresuid() the ambient capabilities can be raised as they are present in
4375                  * the permitted and inheritable set. However it is possible that someone wants to
4376                  * set ambient capabilities without changing the user, so we also set the ambient
4377                  * capabilities here.
4378                  * The requested ambient capabilities are raised in the inheritable set if the
4379                  * second argument is true. */
4380                 if (!needs_ambient_hack) {
4381                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4382                         if (r < 0) {
4383                                 *exit_status = EXIT_CAPABILITIES;
4384                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4385                         }
4386                 }
4387         }
4388
4389         /* chroot to root directory first, before we lose the ability to chroot */
4390         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4391         if (r < 0)
4392                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4393
4394         if (needs_setuid) {
4395                 if (uid_is_valid(uid)) {
4396                         r = enforce_user(context, uid);
4397                         if (r < 0) {
4398                                 *exit_status = EXIT_USER;
4399                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4400                         }
4401
4402                         if (!needs_ambient_hack &&
4403                             context->capability_ambient_set != 0) {
4404
4405                                 /* Raise the ambient capabilities after user change. */
4406                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4407                                 if (r < 0) {
4408                                         *exit_status = EXIT_CAPABILITIES;
4409                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4410                                 }
4411                         }
4412                 }
4413         }
4414
4415         /* Apply working directory here, because the working directory might be on NFS and only the user running
4416          * this service might have the correct privilege to change to the working directory */
4417         r = apply_working_directory(context, params, home, exit_status);
4418         if (r < 0)
4419                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4420
4421         if (needs_sandboxing) {
4422                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4423                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4424                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4425                  * are restricted. */
4426
4427 #if HAVE_SELINUX
4428                 if (use_selinux) {
4429                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4430
4431                         if (exec_context) {
4432                                 r = setexeccon(exec_context);
4433                                 if (r < 0) {
4434                                         *exit_status = EXIT_SELINUX_CONTEXT;
4435                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4436                                 }
4437                         }
4438                 }
4439 #endif
4440
4441 #if HAVE_APPARMOR
4442                 if (use_apparmor && context->apparmor_profile) {
4443                         r = aa_change_onexec(context->apparmor_profile);
4444                         if (r < 0 && !context->apparmor_profile_ignore) {
4445                                 *exit_status = EXIT_APPARMOR_PROFILE;
4446                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4447                         }
4448                 }
4449 #endif
4450
4451                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4452                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4453                  * CAP_SETPCAP. */
4454                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4455                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4456                          * effective set here.
4457                          * The effective set is overwritten during execve  with the following  values:
4458                          * - ambient set (for non-root processes)
4459                          * - (inheritable | bounding) set for root processes)
4460                          *
4461                          * Hence there is no security impact to raise it in the effective set before execve
4462                          */
4463                         r = capability_gain_cap_setpcap(NULL);
4464                         if (r < 0) {
4465                                 *exit_status = EXIT_CAPABILITIES;
4466                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4467                         }
4468                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4469                                 *exit_status = EXIT_SECUREBITS;
4470                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4471                         }
4472                 }
4473
4474                 if (context_has_no_new_privileges(context))
4475                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4476                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4477                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4478                         }
4479
4480 #if HAVE_SECCOMP
4481                 r = apply_address_families(unit, context);
4482                 if (r < 0) {
4483                         *exit_status = EXIT_ADDRESS_FAMILIES;
4484                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4485                 }
4486
4487                 r = apply_memory_deny_write_execute(unit, context);
4488                 if (r < 0) {
4489                         *exit_status = EXIT_SECCOMP;
4490                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4491                 }
4492
4493                 r = apply_restrict_realtime(unit, context);
4494                 if (r < 0) {
4495                         *exit_status = EXIT_SECCOMP;
4496                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4497                 }
4498
4499                 r = apply_restrict_suid_sgid(unit, context);
4500                 if (r < 0) {
4501                         *exit_status = EXIT_SECCOMP;
4502                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4503                 }
4504
4505                 r = apply_restrict_namespaces(unit, context);
4506                 if (r < 0) {
4507                         *exit_status = EXIT_SECCOMP;
4508                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4509                 }
4510
4511                 r = apply_protect_sysctl(unit, context);
4512                 if (r < 0) {
4513                         *exit_status = EXIT_SECCOMP;
4514                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4515                 }
4516
4517                 r = apply_protect_kernel_modules(unit, context);
4518                 if (r < 0) {
4519                         *exit_status = EXIT_SECCOMP;
4520                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4521                 }
4522
4523                 r = apply_protect_kernel_logs(unit, context);
4524                 if (r < 0) {
4525                         *exit_status = EXIT_SECCOMP;
4526                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4527                 }
4528
4529                 r = apply_protect_clock(unit, context);
4530                 if (r < 0) {
4531                         *exit_status = EXIT_SECCOMP;
4532                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4533                 }
4534
4535                 r = apply_private_devices(unit, context);
4536                 if (r < 0) {
4537                         *exit_status = EXIT_SECCOMP;
4538                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4539                 }
4540
4541                 r = apply_syscall_archs(unit, context);
4542                 if (r < 0) {
4543                         *exit_status = EXIT_SECCOMP;
4544                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4545                 }
4546
4547                 r = apply_lock_personality(unit, context);
4548                 if (r < 0) {
4549                         *exit_status = EXIT_SECCOMP;
4550                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4551                 }
4552
4553                 r = apply_syscall_log(unit, context);
4554                 if (r < 0) {
4555                         *exit_status = EXIT_SECCOMP;
4556                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4557                 }
4558
4559                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4560                  * by the filter as little as possible. */
4561                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4562                 if (r < 0) {
4563                         *exit_status = EXIT_SECCOMP;
4564                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4565                 }
4566 #endif
4567         }
4568
4569         if (!strv_isempty(context->unset_environment)) {
4570                 char **ee = NULL;
4571
4572                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4573                 if (!ee) {
4574                         *exit_status = EXIT_MEMORY;
4575                         return log_oom();
4576                 }
4577
4578                 strv_free_and_replace(accum_env, ee);
4579         }
4580
4581         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4582                 replaced_argv = replace_env_argv(command->argv, accum_env);
4583                 if (!replaced_argv) {
4584                         *exit_status = EXIT_MEMORY;
4585                         return log_oom();
4586                 }
4587                 final_argv = replaced_argv;
4588         } else
4589                 final_argv = command->argv;
4590
4591         if (DEBUG_LOGGING) {
4592                 _cleanup_free_ char *line;
4593
4594                 line = exec_command_line(final_argv);
4595                 if (line)
4596                         log_struct(LOG_DEBUG,
4597                                    "EXECUTABLE=%s", executable,
4598                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4599                                    LOG_UNIT_ID(unit),
4600                                    LOG_UNIT_INVOCATION_ID(unit));
4601         }
4602
4603         if (exec_fd >= 0) {
4604                 uint8_t hot = 1;
4605
4606                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4607                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4608
4609                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4610                         *exit_status = EXIT_EXEC;
4611                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4612                 }
4613         }
4614
4615         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4616
4617         if (exec_fd >= 0) {
4618                 uint8_t hot = 0;
4619
4620                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4621                  * that POLLHUP on it no longer means execve() succeeded. */
4622
4623                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4624                         *exit_status = EXIT_EXEC;
4625                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4626                 }
4627         }
4628
4629         *exit_status = EXIT_EXEC;
4630         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4631 }
4632
/* Forward declarations: both helpers are called by exec_spawn() below, but are only defined further down
 * in this file. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4635
4636 int exec_spawn(Unit *unit,
4637                ExecCommand *command,
4638                const ExecContext *context,
4639                const ExecParameters *params,
4640                ExecRuntime *runtime,
4641                DynamicCreds *dcreds,
4642                pid_t *ret) {
4643
4644         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4645         _cleanup_free_ char *subcgroup_path = NULL;
4646         _cleanup_strv_free_ char **files_env = NULL;
4647         size_t n_storage_fds = 0, n_socket_fds = 0;
4648         _cleanup_free_ char *line = NULL;
4649         pid_t pid;
4650
4651         assert(unit);
4652         assert(command);
4653         assert(context);
4654         assert(ret);
4655         assert(params);
4656         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4657
4658         if (context->std_input == EXEC_INPUT_SOCKET ||
4659             context->std_output == EXEC_OUTPUT_SOCKET ||
4660             context->std_error == EXEC_OUTPUT_SOCKET) {
4661
4662                 if (params->n_socket_fds > 1)
4663                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4664
4665                 if (params->n_socket_fds == 0)
4666                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4667
4668                 socket_fd = params->fds[0];
4669         } else {
4670                 socket_fd = -1;
4671                 fds = params->fds;
4672                 n_socket_fds = params->n_socket_fds;
4673                 n_storage_fds = params->n_storage_fds;
4674         }
4675
4676         r = exec_context_named_iofds(context, params, named_iofds);
4677         if (r < 0)
4678                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4679
4680         r = exec_context_load_environment(unit, context, &files_env);
4681         if (r < 0)
4682                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4683
4684         line = exec_command_line(command->argv);
4685         if (!line)
4686                 return log_oom();
4687
4688         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4689            and, until the next SELinux policy changes, we save further reloads in future children. */
4690         mac_selinux_maybe_reload();
4691
4692         log_struct(LOG_DEBUG,
4693                    LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4694                    "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4695                                                       the mount namespace in the child, but we want to log
4696                                                       from the parent, so we need to use the (possibly
4697                                                       inaccurate) path here. */
4698                    LOG_UNIT_ID(unit),
4699                    LOG_UNIT_INVOCATION_ID(unit));
4700
4701         if (params->cgroup_path) {
4702                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4703                 if (r < 0)
4704                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4705                 if (r > 0) { /* We are using a child cgroup */
4706                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4707                         if (r < 0)
4708                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4709
4710                         /* Normally we would not propagate the oomd xattrs to children but since we created this
4711                          * sub-cgroup internally we should do it. */
4712                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
4713                 }
4714         }
4715
4716         pid = fork();
4717         if (pid < 0)
4718                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4719
4720         if (pid == 0) {
4721                 int exit_status = EXIT_SUCCESS;
4722
4723                 r = exec_child(unit,
4724                                command,
4725                                context,
4726                                params,
4727                                runtime,
4728                                dcreds,
4729                                socket_fd,
4730                                named_iofds,
4731                                fds,
4732                                n_socket_fds,
4733                                n_storage_fds,
4734                                files_env,
4735                                unit->manager->user_lookup_fds[1],
4736                                &exit_status);
4737
4738                 if (r < 0) {
4739                         const char *status =
4740                                 exit_status_to_string(exit_status,
4741                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4742
4743                         log_struct_errno(LOG_ERR, r,
4744                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4745                                          LOG_UNIT_ID(unit),
4746                                          LOG_UNIT_INVOCATION_ID(unit),
4747                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4748                                                           status, command->path),
4749                                          "EXECUTABLE=%s", command->path);
4750                 }
4751
4752                 _exit(exit_status);
4753         }
4754
4755         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4756
4757         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4758          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4759          * process will be killed too). */
4760         if (subcgroup_path)
4761                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4762
4763         exec_status_start(&command->exec_status, pid);
4764
4765         *ret = pid;
4766         return 0;
4767 }
4768
4769 void exec_context_init(ExecContext *c) {
4770         assert(c);
4771
4772         c->umask = 0022;
4773         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4774         c->cpu_sched_policy = SCHED_OTHER;
4775         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4776         c->syslog_level_prefix = true;
4777         c->ignore_sigpipe = true;
4778         c->timer_slack_nsec = NSEC_INFINITY;
4779         c->personality = PERSONALITY_INVALID;
4780         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4781                 c->directories[t].mode = 0755;
4782         c->timeout_clean_usec = USEC_INFINITY;
4783         c->capability_bounding_set = CAP_ALL;
4784         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4785         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4786         c->log_level_max = -1;
4787 #if HAVE_SECCOMP
4788         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4789 #endif
4790         numa_policy_reset(&c->numa_policy);
4791 }
4792
4793 void exec_context_done(ExecContext *c) {
4794         assert(c);
4795
4796         c->environment = strv_free(c->environment);
4797         c->environment_files = strv_free(c->environment_files);
4798         c->pass_environment = strv_free(c->pass_environment);
4799         c->unset_environment = strv_free(c->unset_environment);
4800
4801         rlimit_free_all(c->rlimit);
4802
4803         for (size_t l = 0; l < 3; l++) {
4804                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4805                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4806         }
4807
4808         c->working_directory = mfree(c->working_directory);
4809         c->root_directory = mfree(c->root_directory);
4810         c->root_image = mfree(c->root_image);
4811         c->root_image_options = mount_options_free_all(c->root_image_options);
4812         c->root_hash = mfree(c->root_hash);
4813         c->root_hash_size = 0;
4814         c->root_hash_path = mfree(c->root_hash_path);
4815         c->root_hash_sig = mfree(c->root_hash_sig);
4816         c->root_hash_sig_size = 0;
4817         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4818         c->root_verity = mfree(c->root_verity);
4819         c->tty_path = mfree(c->tty_path);
4820         c->syslog_identifier = mfree(c->syslog_identifier);
4821         c->user = mfree(c->user);
4822         c->group = mfree(c->group);
4823
4824         c->supplementary_groups = strv_free(c->supplementary_groups);
4825
4826         c->pam_name = mfree(c->pam_name);
4827
4828         c->read_only_paths = strv_free(c->read_only_paths);
4829         c->read_write_paths = strv_free(c->read_write_paths);
4830         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4831         c->exec_paths = strv_free(c->exec_paths);
4832         c->no_exec_paths = strv_free(c->no_exec_paths);
4833
4834         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4835         c->bind_mounts = NULL;
4836         c->n_bind_mounts = 0;
4837         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4838         c->temporary_filesystems = NULL;
4839         c->n_temporary_filesystems = 0;
4840         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
4841
4842         cpu_set_reset(&c->cpu_set);
4843         numa_policy_reset(&c->numa_policy);
4844
4845         c->utmp_id = mfree(c->utmp_id);
4846         c->selinux_context = mfree(c->selinux_context);
4847         c->apparmor_profile = mfree(c->apparmor_profile);
4848         c->smack_process_label = mfree(c->smack_process_label);
4849
4850         c->syscall_filter = hashmap_free(c->syscall_filter);
4851         c->syscall_archs = set_free(c->syscall_archs);
4852         c->address_families = set_free(c->address_families);
4853
4854         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4855                 c->directories[t].paths = strv_free(c->directories[t].paths);
4856
4857         c->log_level_max = -1;
4858
4859         exec_context_free_log_extra_fields(c);
4860
4861         c->log_ratelimit_interval_usec = 0;
4862         c->log_ratelimit_burst = 0;
4863
4864         c->stdin_data = mfree(c->stdin_data);
4865         c->stdin_data_size = 0;
4866
4867         c->network_namespace_path = mfree(c->network_namespace_path);
4868
4869         c->log_namespace = mfree(c->log_namespace);
4870
4871         c->load_credentials = strv_free(c->load_credentials);
4872         c->set_credentials = hashmap_free(c->set_credentials);
4873 }
4874
4875 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4876         char **i;
4877
4878         assert(c);
4879
4880         if (!runtime_prefix)
4881                 return 0;
4882
4883         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4884                 _cleanup_free_ char *p;
4885
4886                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4887                         p = path_join(runtime_prefix, "private", *i);
4888                 else
4889                         p = path_join(runtime_prefix, *i);
4890                 if (!p)
4891                         return -ENOMEM;
4892
4893                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4894                  * service next. */
4895                 (void) rm_rf(p, REMOVE_ROOT);
4896         }
4897
4898         return 0;
4899 }
4900
4901 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4902         _cleanup_free_ char *p = NULL;
4903
4904         assert(c);
4905
4906         if (!runtime_prefix || !unit)
4907                 return 0;
4908
4909         p = path_join(runtime_prefix, "credentials", unit);
4910         if (!p)
4911                 return -ENOMEM;
4912
4913         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4914          * unmount it, and afterwards remove the mount point */
4915         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4916         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4917
4918         return 0;
4919 }
4920
4921 static void exec_command_done(ExecCommand *c) {
4922         assert(c);
4923
4924         c->path = mfree(c->path);
4925         c->argv = strv_free(c->argv);
4926 }
4927
4928 void exec_command_done_array(ExecCommand *c, size_t n) {
4929         for (size_t i = 0; i < n; i++)
4930                 exec_command_done(c+i);
4931 }
4932
4933 ExecCommand* exec_command_free_list(ExecCommand *c) {
4934         ExecCommand *i;
4935
4936         while ((i = c)) {
4937                 LIST_REMOVE(command, c, i);
4938                 exec_command_done(i);
4939                 free(i);
4940         }
4941
4942         return NULL;
4943 }
4944
4945 void exec_command_free_array(ExecCommand **c, size_t n) {
4946         for (size_t i = 0; i < n; i++)
4947                 c[i] = exec_command_free_list(c[i]);
4948 }
4949
4950 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4951         for (size_t i = 0; i < n; i++)
4952                 exec_status_reset(&c[i].exec_status);
4953 }
4954
4955 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4956         for (size_t i = 0; i < n; i++) {
4957                 ExecCommand *z;
4958
4959                 LIST_FOREACH(command, z, c[i])
4960                         exec_status_reset(&z->exec_status);
4961         }
4962 }
4963
/* Context passed as userdata to invalid_env(), so that a rejected environment assignment can be logged
 * together with the unit and the file it came from. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file that contained the assignment */
} InvalidEnvInfo;
4968
4969 static void invalid_env(const char *p, void *userdata) {
4970         InvalidEnvInfo *info = userdata;
4971
4972         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4973 }
4974
4975 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4976         assert(c);
4977
4978         switch (fd_index) {
4979
4980         case STDIN_FILENO:
4981                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4982                         return NULL;
4983
4984                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4985
4986         case STDOUT_FILENO:
4987                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4988                         return NULL;
4989
4990                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4991
4992         case STDERR_FILENO:
4993                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4994                         return NULL;
4995
4996                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4997
4998         default:
4999                 return NULL;
5000         }
5001 }
5002
5003 static int exec_context_named_iofds(
5004                 const ExecContext *c,
5005                 const ExecParameters *p,
5006                 int named_iofds[static 3]) {
5007
5008         size_t targets;
5009         const char* stdio_fdname[3];
5010         size_t n_fds;
5011
5012         assert(c);
5013         assert(p);
5014         assert(named_iofds);
5015
5016         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5017                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5018                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5019
5020         for (size_t i = 0; i < 3; i++)
5021                 stdio_fdname[i] = exec_context_fdname(c, i);
5022
5023         n_fds = p->n_storage_fds + p->n_socket_fds;
5024
5025         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5026                 if (named_iofds[STDIN_FILENO] < 0 &&
5027                     c->std_input == EXEC_INPUT_NAMED_FD &&
5028                     stdio_fdname[STDIN_FILENO] &&
5029                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5030
5031                         named_iofds[STDIN_FILENO] = p->fds[i];
5032                         targets--;
5033
5034                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5035                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5036                            stdio_fdname[STDOUT_FILENO] &&
5037                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5038
5039                         named_iofds[STDOUT_FILENO] = p->fds[i];
5040                         targets--;
5041
5042                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5043                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5044                            stdio_fdname[STDERR_FILENO] &&
5045                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5046
5047                         named_iofds[STDERR_FILENO] = p->fds[i];
5048                         targets--;
5049                 }
5050
5051         return targets == 0 ? 0 : -ENOENT;
5052 }
5053
5054 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5055         char **i, **r = NULL;
5056
5057         assert(c);
5058         assert(l);
5059
5060         STRV_FOREACH(i, c->environment_files) {
5061                 char *fn;
5062                 int k;
5063                 bool ignore = false;
5064                 char **p;
5065                 _cleanup_globfree_ glob_t pglob = {};
5066
5067                 fn = *i;
5068
5069                 if (fn[0] == '-') {
5070                         ignore = true;
5071                         fn++;
5072                 }
5073
5074                 if (!path_is_absolute(fn)) {
5075                         if (ignore)
5076                                 continue;
5077
5078                         strv_free(r);
5079                         return -EINVAL;
5080                 }
5081
5082                 /* Filename supports globbing, take all matching files */
5083                 k = safe_glob(fn, 0, &pglob);
5084                 if (k < 0) {
5085                         if (ignore)
5086                                 continue;
5087
5088                         strv_free(r);
5089                         return k;
5090                 }
5091
5092                 /* When we don't match anything, -ENOENT should be returned */
5093                 assert(pglob.gl_pathc > 0);
5094
5095                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5096                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5097                         if (k < 0) {
5098                                 if (ignore)
5099                                         continue;
5100
5101                                 strv_free(r);
5102                                 return k;
5103                         }
5104                         /* Log invalid environment variables with filename */
5105                         if (p) {
5106                                 InvalidEnvInfo info = {
5107                                         .unit = unit,
5108                                         .path = pglob.gl_pathv[n]
5109                                 };
5110
5111                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5112                         }
5113
5114                         if (!r)
5115                                 r = p;
5116                         else {
5117                                 char **m;
5118
5119                                 m = strv_env_merge(2, r, p);
5120                                 strv_free(r);
5121                                 strv_free(p);
5122                                 if (!m)
5123                                         return -ENOMEM;
5124
5125                                 r = m;
5126                         }
5127                 }
5128         }
5129
5130         *l = r;
5131
5132         return 0;
5133 }
5134
5135 static bool tty_may_match_dev_console(const char *tty) {
5136         _cleanup_free_ char *resolved = NULL;
5137
5138         if (!tty)
5139                 return true;
5140
5141         tty = skip_dev_prefix(tty);
5142
5143         /* trivial identity? */
5144         if (streq(tty, "console"))
5145                 return true;
5146
5147         if (resolve_dev_console(&resolved) < 0)
5148                 return true; /* if we could not resolve, assume it may */
5149
5150         /* "tty0" means the active VC, so it may be the same sometimes */
5151         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5152 }
5153
5154 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5155         assert(ec);
5156
5157         return ec->tty_reset ||
5158                 ec->tty_vhangup ||
5159                 ec->tty_vt_disallocate ||
5160                 is_terminal_input(ec->std_input) ||
5161                 is_terminal_output(ec->std_output) ||
5162                 is_terminal_output(ec->std_error);
5163 }
5164
5165 bool exec_context_may_touch_console(const ExecContext *ec) {
5166
5167         return exec_context_may_touch_tty(ec) &&
5168                tty_may_match_dev_console(exec_context_tty_path(ec));
5169 }
5170
5171 static void strv_fprintf(FILE *f, char **l) {
5172         char **g;
5173
5174         assert(f);
5175
5176         STRV_FOREACH(g, l)
5177                 fprintf(f, " %s", *g);
5178 }
5179
/* Writes one "<prefix><name>: <entry> <entry> ...\n" line for the string vector strv to f. Nothing is
 * written if strv is empty.
 *
 * prefix is the line prefix (indentation) supplied by the dump caller, name the label of the setting. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        /* The prefix comes first, then the name: the arguments used to be passed the other way around,
         * which produced "<name><prefix>:" and broke the indentation of the dump. */
        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5191
5192 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5193         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
5194         int r;
5195
5196         assert(c);
5197         assert(f);
5198
5199         prefix = strempty(prefix);
5200
5201         fprintf(f,
5202                 "%sUMask: %04o\n"
5203                 "%sWorkingDirectory: %s\n"
5204                 "%sRootDirectory: %s\n"
5205                 "%sNonBlocking: %s\n"
5206                 "%sPrivateTmp: %s\n"
5207                 "%sPrivateDevices: %s\n"
5208                 "%sProtectKernelTunables: %s\n"
5209                 "%sProtectKernelModules: %s\n"
5210                 "%sProtectKernelLogs: %s\n"
5211                 "%sProtectClock: %s\n"
5212                 "%sProtectControlGroups: %s\n"
5213                 "%sPrivateNetwork: %s\n"
5214                 "%sPrivateUsers: %s\n"
5215                 "%sProtectHome: %s\n"
5216                 "%sProtectSystem: %s\n"
5217                 "%sMountAPIVFS: %s\n"
5218                 "%sIgnoreSIGPIPE: %s\n"
5219                 "%sMemoryDenyWriteExecute: %s\n"
5220                 "%sRestrictRealtime: %s\n"
5221                 "%sRestrictSUIDSGID: %s\n"
5222                 "%sKeyringMode: %s\n"
5223                 "%sProtectHostname: %s\n"
5224                 "%sProtectProc: %s\n"
5225                 "%sProcSubset: %s\n",
5226                 prefix, c->umask,
5227                 prefix, empty_to_root(c->working_directory),
5228                 prefix, empty_to_root(c->root_directory),
5229                 prefix, yes_no(c->non_blocking),
5230                 prefix, yes_no(c->private_tmp),
5231                 prefix, yes_no(c->private_devices),
5232                 prefix, yes_no(c->protect_kernel_tunables),
5233                 prefix, yes_no(c->protect_kernel_modules),
5234                 prefix, yes_no(c->protect_kernel_logs),
5235                 prefix, yes_no(c->protect_clock),
5236                 prefix, yes_no(c->protect_control_groups),
5237                 prefix, yes_no(c->private_network),
5238                 prefix, yes_no(c->private_users),
5239                 prefix, protect_home_to_string(c->protect_home),
5240                 prefix, protect_system_to_string(c->protect_system),
5241                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5242                 prefix, yes_no(c->ignore_sigpipe),
5243                 prefix, yes_no(c->memory_deny_write_execute),
5244                 prefix, yes_no(c->restrict_realtime),
5245                 prefix, yes_no(c->restrict_suid_sgid),
5246                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5247                 prefix, yes_no(c->protect_hostname),
5248                 prefix, protect_proc_to_string(c->protect_proc),
5249                 prefix, proc_subset_to_string(c->proc_subset));
5250
5251         if (c->root_image)
5252                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5253
5254         if (c->root_image_options) {
5255                 MountOptions *o;
5256
5257                 fprintf(f, "%sRootImageOptions:", prefix);
5258                 LIST_FOREACH(mount_options, o, c->root_image_options)
5259                         if (!isempty(o->options))
5260                                 fprintf(f, " %s:%s",
5261                                         partition_designator_to_string(o->partition_designator),
5262                                         o->options);
5263                 fprintf(f, "\n");
5264         }
5265
5266         if (c->root_hash) {
5267                 _cleanup_free_ char *encoded = NULL;
5268                 encoded = hexmem(c->root_hash, c->root_hash_size);
5269                 if (encoded)
5270                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5271         }
5272
5273         if (c->root_hash_path)
5274                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5275
5276         if (c->root_hash_sig) {
5277                 _cleanup_free_ char *encoded = NULL;
5278                 ssize_t len;
5279                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5280                 if (len)
5281                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5282         }
5283
5284         if (c->root_hash_sig_path)
5285                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5286
5287         if (c->root_verity)
5288                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5289
5290         STRV_FOREACH(e, c->environment)
5291                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5292
5293         STRV_FOREACH(e, c->environment_files)
5294                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5295
5296         STRV_FOREACH(e, c->pass_environment)
5297                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5298
5299         STRV_FOREACH(e, c->unset_environment)
5300                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5301
5302         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5303
5304         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5305                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5306
5307                 STRV_FOREACH(d, c->directories[dt].paths)
5308                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5309         }
5310
5311         fprintf(f,
5312                 "%sTimeoutCleanSec: %s\n",
5313                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5314
5315         if (c->nice_set)
5316                 fprintf(f,
5317                         "%sNice: %i\n",
5318                         prefix, c->nice);
5319
5320         if (c->oom_score_adjust_set)
5321                 fprintf(f,
5322                         "%sOOMScoreAdjust: %i\n",
5323                         prefix, c->oom_score_adjust);
5324
5325         if (c->coredump_filter_set)
5326                 fprintf(f,
5327                         "%sCoredumpFilter: 0x%"PRIx64"\n",
5328                         prefix, c->coredump_filter);
5329
5330         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5331                 if (c->rlimit[i]) {
5332                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5333                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5334                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5335                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5336                 }
5337
5338         if (c->ioprio_set) {
5339                 _cleanup_free_ char *class_str = NULL;
5340
5341                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5342                 if (r >= 0)
5343                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5344
5345                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
5346         }
5347
5348         if (c->cpu_sched_set) {
5349                 _cleanup_free_ char *policy_str = NULL;
5350
5351                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5352                 if (r >= 0)
5353                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5354
5355                 fprintf(f,
5356                         "%sCPUSchedulingPriority: %i\n"
5357                         "%sCPUSchedulingResetOnFork: %s\n",
5358                         prefix, c->cpu_sched_priority,
5359                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5360         }
5361
5362         if (c->cpu_set.set) {
5363                 _cleanup_free_ char *affinity = NULL;
5364
5365                 affinity = cpu_set_to_range_string(&c->cpu_set);
5366                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5367         }
5368
5369         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5370                 _cleanup_free_ char *nodes = NULL;
5371
5372                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5373                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5374                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5375         }
5376
5377         if (c->timer_slack_nsec != NSEC_INFINITY)
5378                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5379
5380         fprintf(f,
5381                 "%sStandardInput: %s\n"
5382                 "%sStandardOutput: %s\n"
5383                 "%sStandardError: %s\n",
5384                 prefix, exec_input_to_string(c->std_input),
5385                 prefix, exec_output_to_string(c->std_output),
5386                 prefix, exec_output_to_string(c->std_error));
5387
5388         if (c->std_input == EXEC_INPUT_NAMED_FD)
5389                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5390         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5391                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5392         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5393                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5394
5395         if (c->std_input == EXEC_INPUT_FILE)
5396                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5397         if (c->std_output == EXEC_OUTPUT_FILE)
5398                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5399         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5400                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5401         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5402                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5403         if (c->std_error == EXEC_OUTPUT_FILE)
5404                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5405         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5406                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5407         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5408                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5409
5410         if (c->tty_path)
5411                 fprintf(f,
5412                         "%sTTYPath: %s\n"
5413                         "%sTTYReset: %s\n"
5414                         "%sTTYVHangup: %s\n"
5415                         "%sTTYVTDisallocate: %s\n",
5416                         prefix, c->tty_path,
5417                         prefix, yes_no(c->tty_reset),
5418                         prefix, yes_no(c->tty_vhangup),
5419                         prefix, yes_no(c->tty_vt_disallocate));
5420
5421         if (IN_SET(c->std_output,
5422                    EXEC_OUTPUT_KMSG,
5423                    EXEC_OUTPUT_JOURNAL,
5424                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5425                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5426             IN_SET(c->std_error,
5427                    EXEC_OUTPUT_KMSG,
5428                    EXEC_OUTPUT_JOURNAL,
5429                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5430                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5431
5432                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5433
5434                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5435                 if (r >= 0)
5436                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5437
5438                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5439                 if (r >= 0)
5440                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5441         }
5442
5443         if (c->log_level_max >= 0) {
5444                 _cleanup_free_ char *t = NULL;
5445
5446                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5447
5448                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5449         }
5450
5451         if (c->log_ratelimit_interval_usec > 0) {
5452                 char buf_timespan[FORMAT_TIMESPAN_MAX];
5453
5454                 fprintf(f,
5455                         "%sLogRateLimitIntervalSec: %s\n",
5456                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
5457         }
5458
5459         if (c->log_ratelimit_burst > 0)
5460                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5461
5462         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5463                 fprintf(f, "%sLogExtraFields: ", prefix);
5464                 fwrite(c->log_extra_fields[j].iov_base,
5465                        1, c->log_extra_fields[j].iov_len,
5466                        f);
5467                 fputc('\n', f);
5468         }
5469
5470         if (c->log_namespace)
5471                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5472
5473         if (c->secure_bits) {
5474                 _cleanup_free_ char *str = NULL;
5475
5476                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5477                 if (r >= 0)
5478                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5479         }
5480
5481         if (c->capability_bounding_set != CAP_ALL) {
5482                 _cleanup_free_ char *str = NULL;
5483
5484                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5485                 if (r >= 0)
5486                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5487         }
5488
5489         if (c->capability_ambient_set != 0) {
5490                 _cleanup_free_ char *str = NULL;
5491
5492                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5493                 if (r >= 0)
5494                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5495         }
5496
5497         if (c->user)
5498                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5499         if (c->group)
5500                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5501
5502         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5503
5504         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5505
5506         if (c->pam_name)
5507                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5508
5509         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5510         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5511         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5512         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5513         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5514
5515         for (size_t i = 0; i < c->n_bind_mounts; i++)
5516                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5517                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5518                         c->bind_mounts[i].ignore_enoent ? "-": "",
5519                         c->bind_mounts[i].source,
5520                         c->bind_mounts[i].destination,
5521                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5522
5523         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5524                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5525
5526                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5527                         t->path,
5528                         isempty(t->options) ? "" : ":",
5529                         strempty(t->options));
5530         }
5531
5532         if (c->utmp_id)
5533                 fprintf(f,
5534                         "%sUtmpIdentifier: %s\n",
5535                         prefix, c->utmp_id);
5536
5537         if (c->selinux_context)
5538                 fprintf(f,
5539                         "%sSELinuxContext: %s%s\n",
5540                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5541
5542         if (c->apparmor_profile)
5543                 fprintf(f,
5544                         "%sAppArmorProfile: %s%s\n",
5545                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5546
5547         if (c->smack_process_label)
5548                 fprintf(f,
5549                         "%sSmackProcessLabel: %s%s\n",
5550                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5551
5552         if (c->personality != PERSONALITY_INVALID)
5553                 fprintf(f,
5554                         "%sPersonality: %s\n",
5555                         prefix, strna(personality_to_string(c->personality)));
5556
5557         fprintf(f,
5558                 "%sLockPersonality: %s\n",
5559                 prefix, yes_no(c->lock_personality));
5560
5561         if (c->syscall_filter) {
5562 #if HAVE_SECCOMP
5563                 void *id, *val;
5564                 bool first = true;
5565 #endif
5566
5567                 fprintf(f,
5568                         "%sSystemCallFilter: ",
5569                         prefix);
5570
5571                 if (!c->syscall_allow_list)
5572                         fputc('~', f);
5573
5574 #if HAVE_SECCOMP
5575                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5576                         _cleanup_free_ char *name = NULL;
5577                         const char *errno_name = NULL;
5578                         int num = PTR_TO_INT(val);
5579
5580                         if (first)
5581                                 first = false;
5582                         else
5583                                 fputc(' ', f);
5584
5585                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5586                         fputs(strna(name), f);
5587
5588                         if (num >= 0) {
5589                                 errno_name = seccomp_errno_or_action_to_string(num);
5590                                 if (errno_name)
5591                                         fprintf(f, ":%s", errno_name);
5592                                 else
5593                                         fprintf(f, ":%d", num);
5594                         }
5595                 }
5596 #endif
5597
5598                 fputc('\n', f);
5599         }
5600
5601         if (c->syscall_archs) {
5602 #if HAVE_SECCOMP
5603                 void *id;
5604 #endif
5605
5606                 fprintf(f,
5607                         "%sSystemCallArchitectures:",
5608                         prefix);
5609
5610 #if HAVE_SECCOMP
5611                 SET_FOREACH(id, c->syscall_archs)
5612                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5613 #endif
5614                 fputc('\n', f);
5615         }
5616
5617         if (exec_context_restrict_namespaces_set(c)) {
5618                 _cleanup_free_ char *s = NULL;
5619
5620                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5621                 if (r >= 0)
5622                         fprintf(f, "%sRestrictNamespaces: %s\n",
5623                                 prefix, strna(s));
5624         }
5625
5626         if (c->network_namespace_path)
5627                 fprintf(f,
5628                         "%sNetworkNamespacePath: %s\n",
5629                         prefix, c->network_namespace_path);
5630
5631         if (c->syscall_errno > 0) {
5632 #if HAVE_SECCOMP
5633                 const char *errno_name;
5634 #endif
5635
5636                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5637
5638 #if HAVE_SECCOMP
5639                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5640                 if (errno_name)
5641                         fputs(errno_name, f);
5642                 else
5643                         fprintf(f, "%d", c->syscall_errno);
5644 #endif
5645                 fputc('\n', f);
5646         }
5647
5648         for (size_t i = 0; i < c->n_mount_images; i++) {
5649                 MountOptions *o;
5650
5651                 fprintf(f, "%sMountImages: %s%s:%s%s", prefix,
5652                         c->mount_images[i].ignore_enoent ? "-": "",
5653                         c->mount_images[i].source,
5654                         c->mount_images[i].destination,
5655                         LIST_IS_EMPTY(c->mount_images[i].mount_options) ? "": ":");
5656                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5657                         fprintf(f, "%s:%s",
5658                                 partition_designator_to_string(o->partition_designator),
5659                                 o->options);
5660                 fprintf(f, "\n");
5661         }
5662 }
5663
5664 bool exec_context_maintains_privileges(const ExecContext *c) {
5665         assert(c);
5666
5667         /* Returns true if the process forked off would run under
5668          * an unchanged UID or as root. */
5669
5670         if (!c->user)
5671                 return true;
5672
5673         if (streq(c->user, "root") || streq(c->user, "0"))
5674                 return true;
5675
5676         return false;
5677 }
5678
5679 int exec_context_get_effective_ioprio(const ExecContext *c) {
5680         int p;
5681
5682         assert(c);
5683
5684         if (c->ioprio_set)
5685                 return c->ioprio;
5686
5687         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5688         if (p < 0)
5689                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5690
5691         return p;
5692 }
5693
5694 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5695         assert(c);
5696
5697         /* Explicit setting wins */
5698         if (c->mount_apivfs_set)
5699                 return c->mount_apivfs;
5700
5701         /* Default to "yes" if root directory or image are specified */
5702         if (exec_context_with_rootfs(c))
5703                 return true;
5704
5705         return false;
5706 }
5707
5708 void exec_context_free_log_extra_fields(ExecContext *c) {
5709         assert(c);
5710
5711         for (size_t l = 0; l < c->n_log_extra_fields; l++)
5712                 free(c->log_extra_fields[l].iov_base);
5713         c->log_extra_fields = mfree(c->log_extra_fields);
5714         c->n_log_extra_fields = 0;
5715 }
5716
5717 void exec_context_revert_tty(ExecContext *c) {
5718         int r;
5719
5720         assert(c);
5721
5722         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5723         exec_context_tty_reset(c, NULL);
5724
5725         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5726          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5727          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5728
5729         if (exec_context_may_touch_tty(c)) {
5730                 const char *path;
5731
5732                 path = exec_context_tty_path(c);
5733                 if (path) {
5734                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5735                         if (r < 0 && r != -ENOENT)
5736                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5737                 }
5738         }
5739 }
5740
5741 int exec_context_get_clean_directories(
5742                 ExecContext *c,
5743                 char **prefix,
5744                 ExecCleanMask mask,
5745                 char ***ret) {
5746
5747         _cleanup_strv_free_ char **l = NULL;
5748         int r;
5749
5750         assert(c);
5751         assert(prefix);
5752         assert(ret);
5753
5754         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5755                 char **i;
5756
5757                 if (!FLAGS_SET(mask, 1U << t))
5758                         continue;
5759
5760                 if (!prefix[t])
5761                         continue;
5762
5763                 STRV_FOREACH(i, c->directories[t].paths) {
5764                         char *j;
5765
5766                         j = path_join(prefix[t], *i);
5767                         if (!j)
5768                                 return -ENOMEM;
5769
5770                         r = strv_consume(&l, j);
5771                         if (r < 0)
5772                                 return r;
5773
5774                         /* Also remove private directories unconditionally. */
5775                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5776                                 j = path_join(prefix[t], "private", *i);
5777                                 if (!j)
5778                                         return -ENOMEM;
5779
5780                                 r = strv_consume(&l, j);
5781                                 if (r < 0)
5782                                         return r;
5783                         }
5784                 }
5785         }
5786
5787         *ret = TAKE_PTR(l);
5788         return 0;
5789 }
5790
5791 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5792         ExecCleanMask mask = 0;
5793
5794         assert(c);
5795         assert(ret);
5796
5797         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5798                 if (!strv_isempty(c->directories[t].paths))
5799                         mask |= 1U << t;
5800
5801         *ret = mask;
5802         return 0;
5803 }
5804
5805 void exec_status_start(ExecStatus *s, pid_t pid) {
5806         assert(s);
5807
5808         *s = (ExecStatus) {
5809                 .pid = pid,
5810         };
5811
5812         dual_timestamp_get(&s->start_timestamp);
5813 }
5814
5815 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5816         assert(s);
5817
5818         if (s->pid != pid)
5819                 *s = (ExecStatus) {
5820                         .pid = pid,
5821                 };
5822
5823         dual_timestamp_get(&s->exit_timestamp);
5824
5825         s->code = code;
5826         s->status = status;
5827
5828         if (context && context->utmp_id)
5829                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5830 }
5831
5832 void exec_status_reset(ExecStatus *s) {
5833         assert(s);
5834
5835         *s = (ExecStatus) {};
5836 }
5837
5838 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5839         char buf[FORMAT_TIMESTAMP_MAX];
5840
5841         assert(s);
5842         assert(f);
5843
5844         if (s->pid <= 0)
5845                 return;
5846
5847         prefix = strempty(prefix);
5848
5849         fprintf(f,
5850                 "%sPID: "PID_FMT"\n",
5851                 prefix, s->pid);
5852
5853         if (dual_timestamp_is_set(&s->start_timestamp))
5854                 fprintf(f,
5855                         "%sStart Timestamp: %s\n",
5856                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5857
5858         if (dual_timestamp_is_set(&s->exit_timestamp))
5859                 fprintf(f,
5860                         "%sExit Timestamp: %s\n"
5861                         "%sExit Code: %s\n"
5862                         "%sExit Status: %i\n",
5863                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5864                         prefix, sigchld_code_to_string(s->code),
5865                         prefix, s->status);
5866 }
5867
5868 static char *exec_command_line(char **argv) {
5869         size_t k;
5870         char *n, *p, **a;
5871         bool first = true;
5872
5873         assert(argv);
5874
5875         k = 1;
5876         STRV_FOREACH(a, argv)
5877                 k += strlen(*a)+3;
5878
5879         n = new(char, k);
5880         if (!n)
5881                 return NULL;
5882
5883         p = n;
5884         STRV_FOREACH(a, argv) {
5885
5886                 if (!first)
5887                         *(p++) = ' ';
5888                 else
5889                         first = false;
5890
5891                 if (strpbrk(*a, WHITESPACE)) {
5892                         *(p++) = '\'';
5893                         p = stpcpy(p, *a);
5894                         *(p++) = '\'';
5895                 } else
5896                         p = stpcpy(p, *a);
5897
5898         }
5899
5900         *p = 0;
5901
5902         /* FIXME: this doesn't really handle arguments that have
5903          * spaces and ticks in them */
5904
5905         return n;
5906 }
5907
5908 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5909         _cleanup_free_ char *cmd = NULL;
5910         const char *prefix2;
5911
5912         assert(c);
5913         assert(f);
5914
5915         prefix = strempty(prefix);
5916         prefix2 = strjoina(prefix, "\t");
5917
5918         cmd = exec_command_line(c->argv);
5919         fprintf(f,
5920                 "%sCommand Line: %s\n",
5921                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5922
5923         exec_status_dump(&c->exec_status, f, prefix2);
5924 }
5925
5926 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5927         assert(f);
5928
5929         prefix = strempty(prefix);
5930
5931         LIST_FOREACH(command, c, c)
5932                 exec_command_dump(c, f, prefix);
5933 }
5934
5935 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5936         ExecCommand *end;
5937
5938         assert(l);
5939         assert(e);
5940
5941         if (*l) {
5942                 /* It's kind of important, that we keep the order here */
5943                 LIST_FIND_TAIL(command, *l, end);
5944                 LIST_INSERT_AFTER(command, *l, end, e);
5945         } else
5946               *l = e;
5947 }
5948
5949 int exec_command_set(ExecCommand *c, const char *path, ...) {
5950         va_list ap;
5951         char **l, *p;
5952
5953         assert(c);
5954         assert(path);
5955
5956         va_start(ap, path);
5957         l = strv_new_ap(path, ap);
5958         va_end(ap);
5959
5960         if (!l)
5961                 return -ENOMEM;
5962
5963         p = strdup(path);
5964         if (!p) {
5965                 strv_free(l);
5966                 return -ENOMEM;
5967         }
5968
5969         free_and_replace(c->path, p);
5970
5971         return strv_free_and_replace(c->argv, l);
5972 }
5973
5974 int exec_command_append(ExecCommand *c, const char *path, ...) {
5975         _cleanup_strv_free_ char **l = NULL;
5976         va_list ap;
5977         int r;
5978
5979         assert(c);
5980         assert(path);
5981
5982         va_start(ap, path);
5983         l = strv_new_ap(path, ap);
5984         va_end(ap);
5985
5986         if (!l)
5987                 return -ENOMEM;
5988
5989         r = strv_extend_strv(&c->argv, l, false);
5990         if (r < 0)
5991                 return r;
5992
5993         return 0;
5994 }
5995
5996 static void *remove_tmpdir_thread(void *p) {
5997         _cleanup_free_ char *path = p;
5998
5999         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6000         return NULL;
6001 }
6002
6003 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6004         int r;
6005
6006         if (!rt)
6007                 return NULL;
6008
6009         if (rt->manager)
6010                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6011
6012         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6013
6014         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6015                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6016
6017                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6018                 if (r < 0)
6019                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6020                 else
6021                         rt->tmp_dir = NULL;
6022         }
6023
6024         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6025                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6026
6027                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6028                 if (r < 0)
6029                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6030                 else
6031                         rt->var_tmp_dir = NULL;
6032         }
6033
6034         rt->id = mfree(rt->id);
6035         rt->tmp_dir = mfree(rt->tmp_dir);
6036         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6037         safe_close_pair(rt->netns_storage_socket);
6038         return mfree(rt);
6039 }
6040
6041 static void exec_runtime_freep(ExecRuntime **rt) {
6042         (void) exec_runtime_free(*rt, false);
6043 }
6044
6045 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6046         _cleanup_free_ char *id_copy = NULL;
6047         ExecRuntime *n;
6048
6049         assert(ret);
6050
6051         id_copy = strdup(id);
6052         if (!id_copy)
6053                 return -ENOMEM;
6054
6055         n = new(ExecRuntime, 1);
6056         if (!n)
6057                 return -ENOMEM;
6058
6059         *n = (ExecRuntime) {
6060                 .id = TAKE_PTR(id_copy),
6061                 .netns_storage_socket = { -1, -1 },
6062         };
6063
6064         *ret = n;
6065         return 0;
6066 }
6067
6068 static int exec_runtime_add(
6069                 Manager *m,
6070                 const char *id,
6071                 char **tmp_dir,
6072                 char **var_tmp_dir,
6073                 int netns_storage_socket[2],
6074                 ExecRuntime **ret) {
6075
6076         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6077         int r;
6078
6079         assert(m);
6080         assert(id);
6081
6082         /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */
6083
6084         r = exec_runtime_allocate(&rt, id);
6085         if (r < 0)
6086                 return r;
6087
6088         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6089         if (r < 0)
6090                 return r;
6091
6092         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6093         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6094         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6095
6096         if (netns_storage_socket) {
6097                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6098                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6099         }
6100
6101         rt->manager = m;
6102
6103         if (ret)
6104                 *ret = rt;
6105         /* do not remove created ExecRuntime object when the operation succeeds. */
6106         TAKE_PTR(rt);
6107         return 0;
6108 }
6109
6110 static int exec_runtime_make(
6111                 Manager *m,
6112                 const ExecContext *c,
6113                 const char *id,
6114                 ExecRuntime **ret) {
6115
6116         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6117         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
6118         int r;
6119
6120         assert(m);
6121         assert(c);
6122         assert(id);
6123
6124         /* It is not necessary to create ExecRuntime object. */
6125         if (!c->private_network && !c->private_tmp && !c->network_namespace_path) {
6126                 *ret = NULL;
6127                 return 0;
6128         }
6129
6130         if (c->private_tmp &&
6131             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6132               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6133                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6134                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6135                 if (r < 0)
6136                         return r;
6137         }
6138
6139         if (c->private_network || c->network_namespace_path) {
6140                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6141                         return -errno;
6142         }
6143
6144         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret);
6145         if (r < 0)
6146                 return r;
6147
6148         return 1;
6149 }
6150
6151 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6152         ExecRuntime *rt;
6153         int r;
6154
6155         assert(m);
6156         assert(id);
6157         assert(ret);
6158
6159         rt = hashmap_get(m->exec_runtime_by_id, id);
6160         if (rt)
6161                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6162                 goto ref;
6163
6164         if (!create) {
6165                 *ret = NULL;
6166                 return 0;
6167         }
6168
6169         /* If not found, then create a new object. */
6170         r = exec_runtime_make(m, c, id, &rt);
6171         if (r < 0)
6172                 return r;
6173         if (r == 0) {
6174                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6175                 *ret = NULL;
6176                 return 0;
6177         }
6178
6179 ref:
6180         /* increment reference counter. */
6181         rt->n_ref++;
6182         *ret = rt;
6183         return 1;
6184 }
6185
6186 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6187         if (!rt)
6188                 return NULL;
6189
6190         assert(rt->n_ref > 0);
6191
6192         rt->n_ref--;
6193         if (rt->n_ref > 0)
6194                 return NULL;
6195
6196         return exec_runtime_free(rt, destroy);
6197 }
6198
6199 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6200         ExecRuntime *rt;
6201
6202         assert(m);
6203         assert(f);
6204         assert(fds);
6205
6206         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6207                 fprintf(f, "exec-runtime=%s", rt->id);
6208
6209                 if (rt->tmp_dir)
6210                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6211
6212                 if (rt->var_tmp_dir)
6213                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6214
6215                 if (rt->netns_storage_socket[0] >= 0) {
6216                         int copy;
6217
6218                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6219                         if (copy < 0)
6220                                 return copy;
6221
6222                         fprintf(f, " netns-socket-0=%i", copy);
6223                 }
6224
6225                 if (rt->netns_storage_socket[1] >= 0) {
6226                         int copy;
6227
6228                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6229                         if (copy < 0)
6230                                 return copy;
6231
6232                         fprintf(f, " netns-socket-1=%i", copy);
6233                 }
6234
6235                 fputc('\n', f);
6236         }
6237
6238         return 0;
6239 }
6240
6241 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6242         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6243         ExecRuntime *rt;
6244         int r;
6245
6246         /* This is for the migration from old (v237 or earlier) deserialization text.
6247          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6248          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6249          * so or not from the serialized text, then we always creates a new object owned by this. */
6250
6251         assert(u);
6252         assert(key);
6253         assert(value);
6254
6255         /* Manager manages ExecRuntime objects by the unit id.
6256          * So, we omit the serialized text when the unit does not have id (yet?)... */
6257         if (isempty(u->id)) {
6258                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6259                 return 0;
6260         }
6261
6262         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
6263         if (r < 0) {
6264                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
6265                 return 0;
6266         }
6267
6268         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6269         if (!rt) {
6270                 r = exec_runtime_allocate(&rt_create, u->id);
6271                 if (r < 0)
6272                         return log_oom();
6273
6274                 rt = rt_create;
6275         }
6276
6277         if (streq(key, "tmp-dir")) {
6278                 char *copy;
6279
6280                 copy = strdup(value);
6281                 if (!copy)
6282                         return log_oom();
6283
6284                 free_and_replace(rt->tmp_dir, copy);
6285
6286         } else if (streq(key, "var-tmp-dir")) {
6287                 char *copy;
6288
6289                 copy = strdup(value);
6290                 if (!copy)
6291                         return log_oom();
6292
6293                 free_and_replace(rt->var_tmp_dir, copy);
6294
6295         } else if (streq(key, "netns-socket-0")) {
6296                 int fd;
6297
6298                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6299                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6300                         return 0;
6301                 }
6302
6303                 safe_close(rt->netns_storage_socket[0]);
6304                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6305
6306         } else if (streq(key, "netns-socket-1")) {
6307                 int fd;
6308
6309                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6310                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6311                         return 0;
6312                 }
6313
6314                 safe_close(rt->netns_storage_socket[1]);
6315                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6316         } else
6317                 return 0;
6318
6319         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6320         if (rt_create) {
6321                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6322                 if (r < 0) {
6323                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6324                         return 0;
6325                 }
6326
6327                 rt_create->manager = u->manager;
6328
6329                 /* Avoid cleanup */
6330                 TAKE_PTR(rt_create);
6331         }
6332
6333         return 1;
6334 }
6335
6336 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6337         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6338         char *id = NULL;
6339         int r, fdpair[] = {-1, -1};
6340         const char *p, *v = value;
6341         size_t n;
6342
6343         assert(m);
6344         assert(value);
6345         assert(fds);
6346
6347         n = strcspn(v, " ");
6348         id = strndupa(v, n);
6349         if (v[n] != ' ')
6350                 goto finalize;
6351         p = v + n + 1;
6352
6353         v = startswith(p, "tmp-dir=");
6354         if (v) {
6355                 n = strcspn(v, " ");
6356                 tmp_dir = strndup(v, n);
6357                 if (!tmp_dir)
6358                         return log_oom();
6359                 if (v[n] != ' ')
6360                         goto finalize;
6361                 p = v + n + 1;
6362         }
6363
6364         v = startswith(p, "var-tmp-dir=");
6365         if (v) {
6366                 n = strcspn(v, " ");
6367                 var_tmp_dir = strndup(v, n);
6368                 if (!var_tmp_dir)
6369                         return log_oom();
6370                 if (v[n] != ' ')
6371                         goto finalize;
6372                 p = v + n + 1;
6373         }
6374
6375         v = startswith(p, "netns-socket-0=");
6376         if (v) {
6377                 char *buf;
6378
6379                 n = strcspn(v, " ");
6380                 buf = strndupa(v, n);
6381
6382                 r = safe_atoi(buf, &fdpair[0]);
6383                 if (r < 0)
6384                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6385                 if (!fdset_contains(fds, fdpair[0]))
6386                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6387                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", fdpair[0]);
6388                 fdpair[0] = fdset_remove(fds, fdpair[0]);
6389                 if (v[n] != ' ')
6390                         goto finalize;
6391                 p = v + n + 1;
6392         }
6393
6394         v = startswith(p, "netns-socket-1=");
6395         if (v) {
6396                 char *buf;
6397
6398                 n = strcspn(v, " ");
6399                 buf = strndupa(v, n);
6400                 r = safe_atoi(buf, &fdpair[1]);
6401                 if (r < 0)
6402                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6403                 if (!fdset_contains(fds, fdpair[1]))
6404                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6405                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", fdpair[1]);
6406                 fdpair[1] = fdset_remove(fds, fdpair[1]);
6407         }
6408
6409 finalize:
6410         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL);
6411         if (r < 0)
6412                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6413         return 0;
6414 }
6415
6416 void exec_runtime_vacuum(Manager *m) {
6417         ExecRuntime *rt;
6418
6419         assert(m);
6420
6421         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6422
6423         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6424                 if (rt->n_ref > 0)
6425                         continue;
6426
6427                 (void) exec_runtime_free(rt, false);
6428         }
6429 }
6430
6431 void exec_params_clear(ExecParameters *p) {
6432         if (!p)
6433                 return;
6434
6435         p->environment = strv_free(p->environment);
6436         p->fd_names = strv_free(p->fd_names);
6437         p->fds = mfree(p->fds);
6438         p->exec_fd = safe_close(p->exec_fd);
6439 }
6440
6441 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6442         if (!sc)
6443                 return NULL;
6444
6445         free(sc->id);
6446         free(sc->data);
6447         return mfree(sc);
6448 }
6449
6450 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6451
6452 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6453         [EXEC_INPUT_NULL] = "null",
6454         [EXEC_INPUT_TTY] = "tty",
6455         [EXEC_INPUT_TTY_FORCE] = "tty-force",
6456         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6457         [EXEC_INPUT_SOCKET] = "socket",
6458         [EXEC_INPUT_NAMED_FD] = "fd",
6459         [EXEC_INPUT_DATA] = "data",
6460         [EXEC_INPUT_FILE] = "file",
6461 };
6462
6463 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6464
6465 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6466         [EXEC_OUTPUT_INHERIT] = "inherit",
6467         [EXEC_OUTPUT_NULL] = "null",
6468         [EXEC_OUTPUT_TTY] = "tty",
6469         [EXEC_OUTPUT_KMSG] = "kmsg",
6470         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6471         [EXEC_OUTPUT_JOURNAL] = "journal",
6472         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6473         [EXEC_OUTPUT_SOCKET] = "socket",
6474         [EXEC_OUTPUT_NAMED_FD] = "fd",
6475         [EXEC_OUTPUT_FILE] = "file",
6476         [EXEC_OUTPUT_FILE_APPEND] = "append",
6477         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6478 };
6479
6480 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6481
6482 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6483         [EXEC_UTMP_INIT] = "init",
6484         [EXEC_UTMP_LOGIN] = "login",
6485         [EXEC_UTMP_USER] = "user",
6486 };
6487
6488 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6489
6490 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6491         [EXEC_PRESERVE_NO] = "no",
6492         [EXEC_PRESERVE_YES] = "yes",
6493         [EXEC_PRESERVE_RESTART] = "restart",
6494 };
6495
6496 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6497
6498 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6499 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6500         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6501         [EXEC_DIRECTORY_STATE] = "StateDirectory",
6502         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6503         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6504         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6505 };
6506
6507 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6508
6509 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6510  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6511  * directories, specifically .timer units with their timestamp touch file. */
6512 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6513         [EXEC_DIRECTORY_RUNTIME] = "runtime",
6514         [EXEC_DIRECTORY_STATE] = "state",
6515         [EXEC_DIRECTORY_CACHE] = "cache",
6516         [EXEC_DIRECTORY_LOGS] = "logs",
6517         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6518 };
6519
6520 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6521
6522 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6523  * the service payload in. */
6524 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6525         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6526         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6527         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6528         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6529         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6530 };
6531
6532 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6533
6534 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6535         [EXEC_KEYRING_INHERIT] = "inherit",
6536         [EXEC_KEYRING_PRIVATE] = "private",
6537         [EXEC_KEYRING_SHARED] = "shared",
6538 };
6539
6540 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);