src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "cgroup-setup.h"
  47 #include "chown-recursive.h"
  48 #include "cpu-set-util.h"
  49 #include "def.h"
  50 #include "env-file.h"
  51 #include "env-util.h"
  52 #include "errno-list.h"
  53 #include "execute.h"
  54 #include "exit-status.h"
  55 #include "fd-util.h"
  56 #include "fileio.h"
  57 #include "format-util.h"
  58 #include "fs-util.h"
  59 #include "glob-util.h"
  60 #include "hexdecoct.h"
  61 #include "io-util.h"
  62 #include "ioprio.h"
  63 #include "label.h"
  64 #include "log.h"
  65 #include "macro.h"
  66 #include "manager.h"
  67 #include "memory-util.h"
  68 #include "missing_fs.h"
  69 #include "mkdir.h"
  70 #include "mount-util.h"
  71 #include "mountpoint-util.h"
  72 #include "namespace.h"
  73 #include "parse-util.h"
  74 #include "path-util.h"
  75 #include "process-util.h"
  76 #include "random-util.h"
  77 #include "rlimit-util.h"
  78 #include "rm-rf.h"
  79 #if HAVE_SECCOMP
  80 #include "seccomp-util.h"
  81 #endif
  82 #include "securebits-util.h"
  83 #include "selinux-util.h"
  84 #include "signal-util.h"
  85 #include "smack-util.h"
  86 #include "socket-util.h"
  87 #include "special.h"
  88 #include "stat-util.h"
  89 #include "string-table.h"
  90 #include "string-util.h"
  91 #include "strv.h"
  92 #include "syslog-util.h"
  93 #include "terminal-util.h"
  94 #include "tmpfile-util.h"
  95 #include "umask-util.h"
  96 #include "unit.h"
  97 #include "user-util.h"
  98 #include "utmp-wtmp.h"
  99
 100 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 101 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 102
 103 #define SNDBUF_SIZE (8*1024*1024)
 104
 105 static int shift_fds(int fds[], size_t n_fds) {
 106         if (n_fds <= 0)
 107                 return 0;
 108
 109         /* Modifies the fds array! (sorts it) */
 110
 111         assert(fds);
 112
 113         for (int start = 0;;) {
 114                 int restart_from = -1;
 115
 116                 for (int i = start; i < (int) n_fds; i++) {
 117                         int nfd;
 118
 119                         /* Already at right index? */
 120                         if (fds[i] == i+3)
 121                                 continue;
 122
 123                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 124                         if (nfd < 0)
 125                                 return -errno;
 126
 127                         safe_close(fds[i]);
 128                         fds[i] = nfd;
 129
 130                         /* Hmm, the fd we wanted isn't free? Then
 131                          * let's remember that and try again from here */
 132                         if (nfd != i+3 && restart_from < 0)
 133                                 restart_from = i;
 134                 }
 135
 136                 if (restart_from < 0)
 137                         break;
 138
 139                 start = restart_from;
 140         }
 141
 142         return 0;
 143 }
 144
 145 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 146         size_t n_fds;
 147         int r;
 148
 149         n_fds = n_socket_fds + n_storage_fds;
 150         if (n_fds <= 0)
 151                 return 0;
 152
 153         assert(fds);
 154
 155         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 156          * O_NONBLOCK only applies to socket activation though. */
 157
 158         for (size_t i = 0; i < n_fds; i++) {
 159
 160                 if (i < n_socket_fds) {
 161                         r = fd_nonblock(fds[i], nonblock);
 162                         if (r < 0)
 163                                 return r;
 164                 }
 165
 166                 /* We unconditionally drop FD_CLOEXEC from the fds,
 167                  * since after all we want to pass these fds to our
 168                  * children */
 169
 170                 r = fd_cloexec(fds[i], false);
 171                 if (r < 0)
 172                         return r;
 173         }
 174
 175         return 0;
 176 }
 177
 178 static const char *exec_context_tty_path(const ExecContext *context) {
 179         assert(context);
 180
 181         if (context->stdio_as_fds)
 182                 return NULL;
 183
 184         if (context->tty_path)
 185                 return context->tty_path;
 186
 187         return "/dev/console";
 188 }
 189
 190 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 191         const char *path;
 192
 193         assert(context);
 194
 195         path = exec_context_tty_path(context);
 196
 197         if (context->tty_vhangup) {
 198                 if (p && p->stdin_fd >= 0)
 199                         (void) terminal_vhangup_fd(p->stdin_fd);
 200                 else if (path)
 201                         (void) terminal_vhangup(path);
 202         }
 203
 204         if (context->tty_reset) {
 205                 if (p && p->stdin_fd >= 0)
 206                         (void) reset_terminal_fd(p->stdin_fd, true);
 207                 else if (path)
 208                         (void) reset_terminal(path);
 209         }
 210
 211         if (context->tty_vt_disallocate && path)
 212                 (void) vt_disallocate(path);
 213 }
 214
 215 static bool is_terminal_input(ExecInput i) {
 216         return IN_SET(i,
 217                       EXEC_INPUT_TTY,
 218                       EXEC_INPUT_TTY_FORCE,
 219                       EXEC_INPUT_TTY_FAIL);
 220 }
 221
 222 static bool is_terminal_output(ExecOutput o) {
 223         return IN_SET(o,
 224                       EXEC_OUTPUT_TTY,
 225                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 226                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 227 }
 228
 229 static bool is_kmsg_output(ExecOutput o) {
 230         return IN_SET(o,
 231                       EXEC_OUTPUT_KMSG,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 233 }
 234
 235 static bool exec_context_needs_term(const ExecContext *c) {
 236         assert(c);
 237
 238         /* Return true if the execution context suggests we should set $TERM to something useful. */
 239
 240         if (is_terminal_input(c->std_input))
 241                 return true;
 242
 243         if (is_terminal_output(c->std_output))
 244                 return true;
 245
 246         if (is_terminal_output(c->std_error))
 247                 return true;
 248
 249         return !!c->tty_path;
 250 }
 251
 252 static int open_null_as(int flags, int nfd) {
 253         int fd;
 254
 255         assert(nfd >= 0);
 256
 257         fd = open("/dev/null", flags|O_NOCTTY);
 258         if (fd < 0)
 259                 return -errno;
 260
 261         return move_fd(fd, nfd, false);
 262 }
 263
 264 static int connect_journal_socket(
 265                 int fd,
 266                 const char *log_namespace,
 267                 uid_t uid,
 268                 gid_t gid) {
 269
 270         union sockaddr_union sa;
 271         socklen_t sa_len;
 272         uid_t olduid = UID_INVALID;
 273         gid_t oldgid = GID_INVALID;
 274         const char *j;
 275         int r;
 276
 277         j = log_namespace ?
 278                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 279                 "/run/systemd/journal/stdout";
 280         r = sockaddr_un_set_path(&sa.un, j);
 281         if (r < 0)
 282                 return r;
 283         sa_len = r;
 284
 285         if (gid_is_valid(gid)) {
 286                 oldgid = getgid();
 287
 288                 if (setegid(gid) < 0)
 289                         return -errno;
 290         }
 291
 292         if (uid_is_valid(uid)) {
 293                 olduid = getuid();
 294
 295                 if (seteuid(uid) < 0) {
 296                         r = -errno;
 297                         goto restore_gid;
 298                 }
 299         }
 300
 301         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 302
 303         /* If we fail to restore the uid or gid, things will likely
 304            fail later on. This should only happen if an LSM interferes. */
 305
 306         if (uid_is_valid(uid))
 307                 (void) seteuid(olduid);
 308
 309  restore_gid:
 310         if (gid_is_valid(gid))
 311                 (void) setegid(oldgid);
 312
 313         return r;
 314 }
 315
 316 static int connect_logger_as(
 317                 const Unit *unit,
 318                 const ExecContext *context,
 319                 const ExecParameters *params,
 320                 ExecOutput output,
 321                 const char *ident,
 322                 int nfd,
 323                 uid_t uid,
 324                 gid_t gid) {
 325
 326         _cleanup_close_ int fd = -1;
 327         int r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0)
 344                 return -errno;
 345
 346         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 347
 348         if (dprintf(fd,
 349                 "%s\n"
 350                 "%s\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n",
 356                 context->syslog_identifier ?: ident,
 357                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 358                 context->syslog_priority,
 359                 !!context->syslog_level_prefix,
 360                 false,
 361                 is_kmsg_output(output),
 362                 is_terminal_output(output)) < 0)
 363                 return -errno;
 364
 365         return move_fd(TAKE_FD(fd), nfd, false);
 366 }
 367
 368 static int open_terminal_as(const char *path, int flags, int nfd) {
 369         int fd;
 370
 371         assert(path);
 372         assert(nfd >= 0);
 373
 374         fd = open_terminal(path, flags | O_NOCTTY);
 375         if (fd < 0)
 376                 return fd;
 377
 378         return move_fd(fd, nfd, false);
 379 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa;
 383         socklen_t sa_len;
 384         _cleanup_close_ int fd = -1;
 385         int r;
 386
 387         assert(path);
 388
 389         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 390                 flags |= O_CREAT;
 391
 392         fd = open(path, flags|O_NOCTTY, mode);
 393         if (fd >= 0)
 394                 return TAKE_FD(fd);
 395
 396         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 397                 return -errno;
 398
 399         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 400
 401         r = sockaddr_un_set_path(&sa.un, path);
 402         if (r < 0)
 403                 return r == -EINVAL ? -ENXIO : r;
 404         sa_len = r;
 405
 406         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 407         if (fd < 0)
 408                 return -errno;
 409
 410         if (connect(fd, &sa.sa, sa_len) < 0)
 411                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 412                                                            * indication that this wasn't an AF_UNIX socket after all */
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 r = 0;
 420         if (r < 0)
 421                 return -errno;
 422
 423         return TAKE_FD(fd);
 424 }
 425
 426 static int fixup_input(
 427                 const ExecContext *context,
 428                 int socket_fd,
 429                 bool apply_tty_stdin) {
 430
 431         ExecInput std_input;
 432
 433         assert(context);
 434
 435         std_input = context->std_input;
 436
 437         if (is_terminal_input(std_input) && !apply_tty_stdin)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         return std_input;
 447 }
 448
 449 static int fixup_output(ExecOutput std_output, int socket_fd) {
 450
 451         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 452                 return EXEC_OUTPUT_INHERIT;
 453
 454         return std_output;
 455 }
 456
 457 static int setup_input(
 458                 const ExecContext *context,
 459                 const ExecParameters *params,
 460                 int socket_fd,
 461                 const int named_iofds[static 3]) {
 462
 463         ExecInput i;
 464
 465         assert(context);
 466         assert(params);
 467         assert(named_iofds);
 468
 469         if (params->stdin_fd >= 0) {
 470                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 471                         return -errno;
 472
 473                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 474                 if (isatty(STDIN_FILENO)) {
 475                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 476                         (void) reset_terminal_fd(STDIN_FILENO, true);
 477                 }
 478
 479                 return STDIN_FILENO;
 480         }
 481
 482         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 483
 484         switch (i) {
 485
 486         case EXEC_INPUT_NULL:
 487                 return open_null_as(O_RDONLY, STDIN_FILENO);
 488
 489         case EXEC_INPUT_TTY:
 490         case EXEC_INPUT_TTY_FORCE:
 491         case EXEC_INPUT_TTY_FAIL: {
 492                 int fd;
 493
 494                 fd = acquire_terminal(exec_context_tty_path(context),
 495                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 496                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 497                                                                   ACQUIRE_TERMINAL_WAIT,
 498                                       USEC_INFINITY);
 499                 if (fd < 0)
 500                         return fd;
 501
 502                 return move_fd(fd, STDIN_FILENO, false);
 503         }
 504
 505         case EXEC_INPUT_SOCKET:
 506                 assert(socket_fd >= 0);
 507
 508                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 509
 510         case EXEC_INPUT_NAMED_FD:
 511                 assert(named_iofds[STDIN_FILENO] >= 0);
 512
 513                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 514                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 515
 516         case EXEC_INPUT_DATA: {
 517                 int fd;
 518
 519                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 520                 if (fd < 0)
 521                         return fd;
 522
 523                 return move_fd(fd, STDIN_FILENO, false);
 524         }
 525
 526         case EXEC_INPUT_FILE: {
 527                 bool rw;
 528                 int fd;
 529
 530                 assert(context->stdio_file[STDIN_FILENO]);
 531
 532                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 533                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 534
 535                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         default:
 543                 assert_not_reached("Unknown input type");
 544         }
 545 }
 546
 547 static bool can_inherit_stderr_from_stdout(
 548                 const ExecContext *context,
 549                 ExecOutput o,
 550                 ExecOutput e) {
 551
 552         assert(context);
 553
 554         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 555          * stderr fd */
 556
 557         if (e == EXEC_OUTPUT_INHERIT)
 558                 return true;
 559         if (e != o)
 560                 return false;
 561
 562         if (e == EXEC_OUTPUT_NAMED_FD)
 563                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 564
 565         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 566                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 567
 568         return true;
 569 }
 570
/* Sets up stdout or stderr for the process about to be executed. 'fileno' selects which of the two
 * (STDOUT_FILENO or STDERR_FILENO) is configured.
 *
 * An stdout/stderr fd passed in the parameters takes precedence over everything else. Otherwise the
 * output settings of the context (after fixup_output() has been applied) decide where the fd is
 * connected to. For stderr this function relies on stdin and stdout having been set up already, as it
 * may duplicate STDOUT_FILENO; for inherited output it may duplicate STDIN_FILENO.
 *
 * If the fd gets connected to the journal via a stream, the device and inode number of that stream are
 * stored in *journal_stream_dev and *journal_stream_ino (stderr's data overrides stdout's).
 *
 * Returns a negative errno-style error on failure. The branches that return directly return 'fileno' on
 * success, the others pass on the result of the helper they call. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds win over any configured setting */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Determine the effective stdin and stdout settings, both matter for the inheritance logic below */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on stderr is set up on its own, according to its own setting */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share its fd instead of opening the terminal again */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Failing to reach the journal is not fatal, we fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to be the very same file, duplicate the stdin fd instead of
                 * opening the file a second time */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
                        flags |= O_TRUNC;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 731
 732 static int chown_terminal(int fd, uid_t uid) {
 733         int r;
 734
 735         assert(fd >= 0);
 736
 737         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 738         if (isatty(fd) < 1) {
 739                 if (IN_SET(errno, EINVAL, ENOTTY))
 740                         return 0; /* not a tty */
 741
 742                 return -errno;
 743         }
 744
 745         /* This might fail. What matters are the results. */
 746         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 747         if (r < 0)
 748                 return r;
 749
 750         return 1;
 751 }
 752
 753 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 754         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 755         int r;
 756
 757         assert(_saved_stdin);
 758         assert(_saved_stdout);
 759
 760         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 761         if (saved_stdin < 0)
 762                 return -errno;
 763
 764         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 765         if (saved_stdout < 0)
 766                 return -errno;
 767
 768         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 769         if (fd < 0)
 770                 return fd;
 771
 772         r = chown_terminal(fd, getuid());
 773         if (r < 0)
 774                 return r;
 775
 776         r = reset_terminal_fd(fd, true);
 777         if (r < 0)
 778                 return r;
 779
 780         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 781         fd = -1;
 782         if (r < 0)
 783                 return r;
 784
 785         *_saved_stdin = saved_stdin;
 786         *_saved_stdout = saved_stdout;
 787
 788         saved_stdin = saved_stdout = -1;
 789
 790         return 0;
 791 }
 792
 793 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 794         assert(err < 0);
 795
 796         if (err == -ETIMEDOUT)
 797                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 798         else {
 799                 errno = -err;
 800                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 801         }
 802 }
 803
 804 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 805         _cleanup_close_ int fd = -1;
 806
 807         assert(vc);
 808
 809         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 810         if (fd < 0)
 811                 return;
 812
 813         write_confirm_error_fd(err, fd, u);
 814 }
 815
 816 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 817         int r = 0;
 818
 819         assert(saved_stdin);
 820         assert(saved_stdout);
 821
 822         release_terminal();
 823
 824         if (*saved_stdin >= 0)
 825                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 826                         r = -errno;
 827
 828         if (*saved_stdout >= 0)
 829                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 830                         r = -errno;
 831
 832         *saved_stdin = safe_close(*saved_stdin);
 833         *saved_stdout = safe_close(*saved_stdout);
 834
 835         return r;
 836 }
 837
 838 enum {
 839         CONFIRM_PRETEND_FAILURE = -1,
 840         CONFIRM_PRETEND_SUCCESS =  0,
 841         CONFIRM_EXECUTE = 1,
 842 };
 843
 844 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 845         int saved_stdout = -1, saved_stdin = -1, r;
 846         _cleanup_free_ char *e = NULL;
 847         char c;
 848
 849         /* For any internal errors, assume a positive response. */
 850         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 851         if (r < 0) {
 852                 write_confirm_error(r, vc, u);
 853                 return CONFIRM_EXECUTE;
 854         }
 855
 856         /* confirm_spawn might have been disabled while we were sleeping. */
 857         if (manager_is_confirm_spawn_disabled(u->manager)) {
 858                 r = 1;
 859                 goto restore_stdio;
 860         }
 861
 862         e = ellipsize(cmdline, 60, 100);
 863         if (!e) {
 864                 log_oom();
 865                 r = CONFIRM_EXECUTE;
 866                 goto restore_stdio;
 867         }
 868
 869         for (;;) {
 870                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 871                 if (r < 0) {
 872                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 873                         r = CONFIRM_EXECUTE;
 874                         goto restore_stdio;
 875                 }
 876
 877                 switch (c) {
 878                 case 'c':
 879                         printf("Resuming normal execution.\n");
 880                         manager_disable_confirm_spawn();
 881                         r = 1;
 882                         break;
 883                 case 'D':
 884                         unit_dump(u, stdout, "  ");
 885                         continue; /* ask again */
 886                 case 'f':
 887                         printf("Failing execution.\n");
 888                         r = CONFIRM_PRETEND_FAILURE;
 889                         break;
 890                 case 'h':
 891                         printf("  c - continue, proceed without asking anymore\n"
 892                                "  D - dump, show the state of the unit\n"
 893                                "  f - fail, don't execute the command and pretend it failed\n"
 894                                "  h - help\n"
 895                                "  i - info, show a short summary of the unit\n"
 896                                "  j - jobs, show jobs that are in progress\n"
 897                                "  s - skip, don't execute the command and pretend it succeeded\n"
 898                                "  y - yes, execute the command\n");
 899                         continue; /* ask again */
 900                 case 'i':
 901                         printf("  Description: %s\n"
 902                                "  Unit:        %s\n"
 903                                "  Command:     %s\n",
 904                                u->id, u->description, cmdline);
 905                         continue; /* ask again */
 906                 case 'j':
 907                         manager_dump_jobs(u->manager, stdout, "  ");
 908                         continue; /* ask again */
 909                 case 'n':
 910                         /* 'n' was removed in favor of 'f'. */
 911                         printf("Didn't understand 'n', did you mean 'f'?\n");
 912                         continue; /* ask again */
 913                 case 's':
 914                         printf("Skipping execution.\n");
 915                         r = CONFIRM_PRETEND_SUCCESS;
 916                         break;
 917                 case 'y':
 918                         r = CONFIRM_EXECUTE;
 919                         break;
 920                 default:
 921                         assert_not_reached("Unhandled choice");
 922                 }
 923                 break;
 924         }
 925
 926 restore_stdio:
 927         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 928         return r;
 929 }
 930
 931 static int get_fixed_user(const ExecContext *c, const char **user,
 932                           uid_t *uid, gid_t *gid,
 933                           const char **home, const char **shell) {
 934         int r;
 935         const char *name;
 936
 937         assert(c);
 938
 939         if (!c->user)
 940                 return 0;
 941
 942         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 943          * (i.e. are "/" or "/bin/nologin"). */
 944
 945         name = c->user;
 946         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 947         if (r < 0)
 948                 return r;
 949
 950         *user = name;
 951         return 0;
 952 }
 953
 954 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 955         int r;
 956         const char *name;
 957
 958         assert(c);
 959
 960         if (!c->group)
 961                 return 0;
 962
 963         name = c->group;
 964         r = get_group_creds(&name, gid, 0);
 965         if (r < 0)
 966                 return r;
 967
 968         *group = name;
 969         return 0;
 970 }
 971
 972 static int get_supplementary_groups(const ExecContext *c, const char *user,
 973                                     const char *group, gid_t gid,
 974                                     gid_t **supplementary_gids, int *ngids) {
 975         char **i;
 976         int r, k = 0;
 977         int ngroups_max;
 978         bool keep_groups = false;
 979         gid_t *groups = NULL;
 980         _cleanup_free_ gid_t *l_gids = NULL;
 981
 982         assert(c);
 983
 984         /*
 985          * If user is given, then lookup GID and supplementary groups list.
 986          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 987          * here and as early as possible so we keep the list of supplementary
 988          * groups of the caller.
 989          */
 990         if (user && gid_is_valid(gid) && gid != 0) {
 991                 /* First step, initialize groups from /etc/groups */
 992                 if (initgroups(user, gid) < 0)
 993                         return -errno;
 994
 995                 keep_groups = true;
 996         }
 997
 998         if (strv_isempty(c->supplementary_groups))
 999                 return 0;
1000
1001         /*
1002          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1003          * be positive, otherwise fail.
1004          */
1005         errno = 0;
1006         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1007         if (ngroups_max <= 0)
1008                 return errno_or_else(EOPNOTSUPP);
1009
1010         l_gids = new(gid_t, ngroups_max);
1011         if (!l_gids)
1012                 return -ENOMEM;
1013
1014         if (keep_groups) {
1015                 /*
1016                  * Lookup the list of groups that the user belongs to, we
1017                  * avoid NSS lookups here too for gid=0.
1018                  */
1019                 k = ngroups_max;
1020                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1021                         return -EINVAL;
1022         } else
1023                 k = 0;
1024
1025         STRV_FOREACH(i, c->supplementary_groups) {
1026                 const char *g;
1027
1028                 if (k >= ngroups_max)
1029                         return -E2BIG;
1030
1031                 g = *i;
1032                 r = get_group_creds(&g, l_gids+k, 0);
1033                 if (r < 0)
1034                         return r;
1035
1036                 k++;
1037         }
1038
1039         /*
1040          * Sets ngids to zero to drop all supplementary groups, happens
1041          * when we are under root and SupplementaryGroups= is empty.
1042          */
1043         if (k == 0) {
1044                 *ngids = 0;
1045                 return 0;
1046         }
1047
1048         /* Otherwise get the final list of supplementary groups */
1049         groups = memdup(l_gids, sizeof(gid_t) * k);
1050         if (!groups)
1051                 return -ENOMEM;
1052
1053         *supplementary_gids = groups;
1054         *ngids = k;
1055
1056         groups = NULL;
1057
1058         return 0;
1059 }
1060
/* Applies the supplementary group list and then the primary GID to the calling process.
 *
 * The supplementary groups are only installed when ngids is positive, and the real/effective/saved GID
 * is only changed when gid is valid.
 *
 * Returns 0 on success, the negative error from maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1079
/* Updates the securebits of the calling process.
 *
 * The new value is the current one with all bits in "mask" cleared and all bits in "bits" set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written), 1 if they were
 * changed, or -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1093
1094 static int enforce_user(const ExecContext *context, uid_t uid) {
1095         assert(context);
1096         int r;
1097
1098         if (!uid_is_valid(uid))
1099                 return 0;
1100
1101         /* Sets (but doesn't look up) the uid and make sure we keep the
1102          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1103          * required, so we also need keep-caps in this case.
1104          */
1105
1106         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1107
1108                 /* First step: If we need to keep capabilities but
1109                  * drop privileges we need to make sure we keep our
1110                  * caps, while we drop privileges. */
1111                 if (uid != 0) {
1112                         /* Add KEEP_CAPS to the securebits */
1113                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1114                         if (r < 0)
1115                                 return r;
1116                 }
1117         }
1118
1119         /* Second step: actually set the uids */
1120         if (setresuid(uid, uid, uid) < 0)
1121                 return -errno;
1122
1123         /* At this point we should have all necessary capabilities but
1124            are otherwise a normal user. However, the caps might got
1125            corrupted due to the setresuid() so we need clean them up
1126            later. This is done outside of this call. */
1127
1128         return 0;
1129 }
1130
1131 #if HAVE_PAM
1132
1133 static int null_conv(
1134                 int num_msg,
1135                 const struct pam_message **msg,
1136                 struct pam_response **resp,
1137                 void *appdata_ptr) {
1138
1139         /* We don't support conversations */
1140
1141         return PAM_CONV_ERR;
1142 }
1143
1144 #endif
1145
/* Opens a PAM session for the process being set up and forks off a helper child ("(sd-pam)") that tears
 * the session down again once this process goes away.
 *
 *   name       PAM service name handed to pam_start()
 *   user       user name handed to pam_start()
 *   uid, gid   credentials the helper child drops to (failures there are only logged)
 *   tty        TTY to register as PAM_TTY; if NULL, the TTY behind stdin is used when there is one
 *   env        in/out: every entry is exported to PAM via pam_putenv(); on success the list is replaced
 *              by what pam_getenvlist() returns
 *   fds, n_fds file descriptors that are closed in the helper child
 *
 * On success returns the result of strv_free_and_replace(). On failure returns a negative errno-style
 * value; PAM-level failures are reported as -EPERM. When built without PAM support this is a no-op
 * returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM modules quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through its return value (a positive
                                 * error number) and does not set errno, hence check the return
                                 * value directly instead of comparing against < 0. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with a negative errno in r. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1369
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the
 * given path. If that component is empty the process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *tail;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps, which leaves 8 chars between the parentheses. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-", hence keep only the last 8 chars. */
                tail += n - 8;
                n = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, tail, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = 0;

        rename_process(process_name);
}
1400
1401 static bool context_has_address_families(const ExecContext *c) {
1402         assert(c);
1403
1404         return c->address_families_allow_list ||
1405                 !set_isempty(c->address_families);
1406 }
1407
1408 static bool context_has_syscall_filters(const ExecContext *c) {
1409         assert(c);
1410
1411         return c->syscall_allow_list ||
1412                 !hashmap_isempty(c->syscall_filter);
1413 }
1414
1415 static bool context_has_syscall_logs(const ExecContext *c) {
1416         assert(c);
1417
1418         return c->syscall_log_allow_list ||
1419                 !hashmap_isempty(c->syscall_log);
1420 }
1421
1422 static bool context_has_no_new_privileges(const ExecContext *c) {
1423         assert(c);
1424
1425         if (c->no_new_privileges)
1426                 return true;
1427
1428         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1429                 return false;
1430
1431         /* We need NNP if we have any form of seccomp and are unprivileged */
1432         return context_has_address_families(c) ||
1433                 c->memory_deny_write_execute ||
1434                 c->restrict_realtime ||
1435                 c->restrict_suid_sgid ||
1436                 exec_context_restrict_namespaces_set(c) ||
1437                 c->protect_clock ||
1438                 c->protect_kernel_tunables ||
1439                 c->protect_kernel_modules ||
1440                 c->protect_kernel_logs ||
1441                 c->private_devices ||
1442                 context_has_syscall_filters(c) ||
1443                 context_has_syscall_logs(c) ||
1444                 !set_isempty(c->syscall_archs) ||
1445                 c->lock_personality ||
1446                 c->protect_hostname;
1447 }
1448
1449 static bool exec_context_has_credentials(const ExecContext *context) {
1450
1451         assert(context);
1452
1453         return !hashmap_isempty(context->set_credentials) ||
1454                 context->load_credentials;
1455 }
1456
1457 #if HAVE_SECCOMP
1458
1459 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1460
1461         if (is_seccomp_available())
1462                 return false;
1463
1464         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1465         return true;
1466 }
1467
1468 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1469         uint32_t negative_action, default_action, action;
1470         int r;
1471
1472         assert(u);
1473         assert(c);
1474
1475         if (!context_has_syscall_filters(c))
1476                 return 0;
1477
1478         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1479                 return 0;
1480
1481         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1482
1483         if (c->syscall_allow_list) {
1484                 default_action = negative_action;
1485                 action = SCMP_ACT_ALLOW;
1486         } else {
1487                 default_action = SCMP_ACT_ALLOW;
1488                 action = negative_action;
1489         }
1490
1491         if (needs_ambient_hack) {
1492                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1493                 if (r < 0)
1494                         return r;
1495         }
1496
1497         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1498 }
1499
1500 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1501 #ifdef SCMP_ACT_LOG
1502         uint32_t default_action, action;
1503 #endif
1504
1505         assert(u);
1506         assert(c);
1507
1508         if (!context_has_syscall_logs(c))
1509                 return 0;
1510
1511 #ifdef SCMP_ACT_LOG
1512         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1513                 return 0;
1514
1515         if (c->syscall_log_allow_list) {
1516                 /* Log nothing but the ones listed */
1517                 default_action = SCMP_ACT_ALLOW;
1518                 action = SCMP_ACT_LOG;
1519         } else {
1520                 /* Log everything but the ones listed */
1521                 default_action = SCMP_ACT_LOG;
1522                 action = SCMP_ACT_ALLOW;
1523         }
1524
1525         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1526 #else
1527         /* old libseccomp */
1528         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1529         return 0;
1530 #endif
1531 }
1532
1533 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1534         assert(u);
1535         assert(c);
1536
1537         if (set_isempty(c->syscall_archs))
1538                 return 0;
1539
1540         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1541                 return 0;
1542
1543         return seccomp_restrict_archs(c->syscall_archs);
1544 }
1545
1546 static int apply_address_families(const Unit* u, const ExecContext *c) {
1547         assert(u);
1548         assert(c);
1549
1550         if (!context_has_address_families(c))
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1554                 return 0;
1555
1556         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1557 }
1558
1559 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1560         assert(u);
1561         assert(c);
1562
1563         if (!c->memory_deny_write_execute)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1567                 return 0;
1568
1569         return seccomp_memory_deny_write_execute();
1570 }
1571
1572 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1573         assert(u);
1574         assert(c);
1575
1576         if (!c->restrict_realtime)
1577                 return 0;
1578
1579         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1580                 return 0;
1581
1582         return seccomp_restrict_realtime();
1583 }
1584
1585 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1586         assert(u);
1587         assert(c);
1588
1589         if (!c->restrict_suid_sgid)
1590                 return 0;
1591
1592         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1593                 return 0;
1594
1595         return seccomp_restrict_suid_sgid();
1596 }
1597
1598 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1599         assert(u);
1600         assert(c);
1601
1602         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1603          * let's protect even those systems where this is left on in the kernel. */
1604
1605         if (!c->protect_kernel_tunables)
1606                 return 0;
1607
1608         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1609                 return 0;
1610
1611         return seccomp_protect_sysctl();
1612 }
1613
1614 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1615         assert(u);
1616         assert(c);
1617
1618         /* Turn off module syscalls on ProtectKernelModules=yes */
1619
1620         if (!c->protect_kernel_modules)
1621                 return 0;
1622
1623         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1624                 return 0;
1625
1626         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1627 }
1628
1629 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1630         assert(u);
1631         assert(c);
1632
1633         if (!c->protect_kernel_logs)
1634                 return 0;
1635
1636         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1637                 return 0;
1638
1639         return seccomp_protect_syslog();
1640 }
1641
1642 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1643         assert(u);
1644         assert(c);
1645
1646         if (!c->protect_clock)
1647                 return 0;
1648
1649         if (skip_seccomp_unavailable(u, "ProtectClock="))
1650                 return 0;
1651
1652         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1653 }
1654
1655 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1656         assert(u);
1657         assert(c);
1658
1659         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1660
1661         if (!c->private_devices)
1662                 return 0;
1663
1664         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1665                 return 0;
1666
1667         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1668 }
1669
1670 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1671         assert(u);
1672         assert(c);
1673
1674         if (!exec_context_restrict_namespaces_set(c))
1675                 return 0;
1676
1677         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1678                 return 0;
1679
1680         return seccomp_restrict_namespaces(c->restrict_namespaces);
1681 }
1682
1683 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1684         unsigned long personality;
1685         int r;
1686
1687         assert(u);
1688         assert(c);
1689
1690         if (!c->lock_personality)
1691                 return 0;
1692
1693         if (skip_seccomp_unavailable(u, "LockPersonality="))
1694                 return 0;
1695
1696         personality = c->personality;
1697
1698         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1699         if (personality == PERSONALITY_INVALID) {
1700
1701                 r = opinionated_personality(&personality);
1702                 if (r < 0)
1703                         return r;
1704         }
1705
1706         return seccomp_lock_personality(personality);
1707 }
1708
1709 #endif
1710
1711 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1712         assert(u);
1713         assert(c);
1714
1715         if (!c->protect_hostname)
1716                 return 0;
1717
1718         if (ns_type_supported(NAMESPACE_UTS)) {
1719                 if (unshare(CLONE_NEWUTS) < 0) {
1720                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1721                                 *ret_exit_status = EXIT_NAMESPACE;
1722                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1723                         }
1724
1725                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1726                 }
1727         } else
1728                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1729
1730 #if HAVE_SECCOMP
1731         int r;
1732
1733         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1734                 return 0;
1735
1736         r = seccomp_protect_hostname();
1737         if (r < 0) {
1738                 *ret_exit_status = EXIT_SECCOMP;
1739                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1740         }
1741 #endif
1742
1743         return 0;
1744 }
1745
1746 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1747         assert(idle_pipe);
1748
1749         idle_pipe[1] = safe_close(idle_pipe[1]);
1750         idle_pipe[2] = safe_close(idle_pipe[2]);
1751
1752         if (idle_pipe[0] >= 0) {
1753                 int r;
1754
1755                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1756
1757                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1758                         ssize_t n;
1759
1760                         /* Signal systemd that we are bored and want to continue. */
1761                         n = write(idle_pipe[3], "x", 1);
1762                         if (n > 0)
1763                                 /* Wait for systemd to react to the signal above. */
1764                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1765                 }
1766
1767                 idle_pipe[0] = safe_close(idle_pipe[0]);
1768
1769         }
1770
1771         idle_pipe[3] = safe_close(idle_pipe[3]);
1772 }
1773
1774 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1775
1776 static int build_environment(
1777                 const Unit *u,
1778                 const ExecContext *c,
1779                 const ExecParameters *p,
1780                 size_t n_fds,
1781                 const char *home,
1782                 const char *username,
1783                 const char *shell,
1784                 dev_t journal_stream_dev,
1785                 ino_t journal_stream_ino,
1786                 char ***ret) {
1787
1788         _cleanup_strv_free_ char **our_env = NULL;
1789         size_t n_env = 0;
1790         char *x;
1791
1792         assert(u);
1793         assert(c);
1794         assert(p);
1795         assert(ret);
1796
1797 #define N_ENV_VARS 16
1798         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1799         if (!our_env)
1800                 return -ENOMEM;
1801
1802         if (n_fds > 0) {
1803                 _cleanup_free_ char *joined = NULL;
1804
1805                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1806                         return -ENOMEM;
1807                 our_env[n_env++] = x;
1808
1809                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1810                         return -ENOMEM;
1811                 our_env[n_env++] = x;
1812
1813                 joined = strv_join(p->fd_names, ":");
1814                 if (!joined)
1815                         return -ENOMEM;
1816
1817                 x = strjoin("LISTEN_FDNAMES=", joined);
1818                 if (!x)
1819                         return -ENOMEM;
1820                 our_env[n_env++] = x;
1821         }
1822
1823         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1824                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1825                         return -ENOMEM;
1826                 our_env[n_env++] = x;
1827
1828                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1829                         return -ENOMEM;
1830                 our_env[n_env++] = x;
1831         }
1832
1833         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1834          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1835          * check the database directly. */
1836         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1837                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1838                 if (!x)
1839                         return -ENOMEM;
1840                 our_env[n_env++] = x;
1841         }
1842
1843         if (home) {
1844                 x = strjoin("HOME=", home);
1845                 if (!x)
1846                         return -ENOMEM;
1847
1848                 path_simplify(x + 5, true);
1849                 our_env[n_env++] = x;
1850         }
1851
1852         if (username) {
1853                 x = strjoin("LOGNAME=", username);
1854                 if (!x)
1855                         return -ENOMEM;
1856                 our_env[n_env++] = x;
1857
1858                 x = strjoin("USER=", username);
1859                 if (!x)
1860                         return -ENOMEM;
1861                 our_env[n_env++] = x;
1862         }
1863
1864         if (shell) {
1865                 x = strjoin("SHELL=", shell);
1866                 if (!x)
1867                         return -ENOMEM;
1868
1869                 path_simplify(x + 6, true);
1870                 our_env[n_env++] = x;
1871         }
1872
1873         if (!sd_id128_is_null(u->invocation_id)) {
1874                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1875                         return -ENOMEM;
1876
1877                 our_env[n_env++] = x;
1878         }
1879
1880         if (exec_context_needs_term(c)) {
1881                 const char *tty_path, *term = NULL;
1882
1883                 tty_path = exec_context_tty_path(c);
1884
1885                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1886                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1887                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1888
1889                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1890                         term = getenv("TERM");
1891
1892                 if (!term)
1893                         term = default_term_for_tty(tty_path);
1894
1895                 x = strjoin("TERM=", term);
1896                 if (!x)
1897                         return -ENOMEM;
1898                 our_env[n_env++] = x;
1899         }
1900
1901         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1902                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1903                         return -ENOMEM;
1904
1905                 our_env[n_env++] = x;
1906         }
1907
1908         if (c->log_namespace) {
1909                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1910                 if (!x)
1911                         return -ENOMEM;
1912
1913                 our_env[n_env++] = x;
1914         }
1915
1916         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1917                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1918                 const char *n;
1919
1920                 if (!p->prefix[t])
1921                         continue;
1922
1923                 if (strv_isempty(c->directories[t].paths))
1924                         continue;
1925
1926                 n = exec_directory_env_name_to_string(t);
1927                 if (!n)
1928                         continue;
1929
1930                 pre = strjoin(p->prefix[t], "/");
1931                 if (!pre)
1932                         return -ENOMEM;
1933
1934                 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
1935                 if (!joined)
1936                         return -ENOMEM;
1937
1938                 x = strjoin(n, "=", joined);
1939                 if (!x)
1940                         return -ENOMEM;
1941
1942                 our_env[n_env++] = x;
1943         }
1944
1945         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1946                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1947                 if (!x)
1948                         return -ENOMEM;
1949
1950                 our_env[n_env++] = x;
1951         }
1952
1953         our_env[n_env++] = NULL;
1954         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1955 #undef N_ENV_VARS
1956
1957         *ret = TAKE_PTR(our_env);
1958
1959         return 0;
1960 }
1961
1962 static int build_pass_environment(const ExecContext *c, char ***ret) {
1963         _cleanup_strv_free_ char **pass_env = NULL;
1964         size_t n_env = 0, n_bufsize = 0;
1965         char **i;
1966
1967         STRV_FOREACH(i, c->pass_environment) {
1968                 _cleanup_free_ char *x = NULL;
1969                 char *v;
1970
1971                 v = getenv(*i);
1972                 if (!v)
1973                         continue;
1974                 x = strjoin(*i, "=", v);
1975                 if (!x)
1976                         return -ENOMEM;
1977
1978                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1979                         return -ENOMEM;
1980
1981                 pass_env[n_env++] = TAKE_PTR(x);
1982                 pass_env[n_env] = NULL;
1983         }
1984
1985         *ret = TAKE_PTR(pass_env);
1986
1987         return 0;
1988 }
1989
1990 bool exec_needs_mount_namespace(
1991                 const ExecContext *context,
1992                 const ExecParameters *params,
1993                 const ExecRuntime *runtime) {
1994
1995         assert(context);
1996
1997         if (context->root_image)
1998                 return true;
1999
2000         if (!strv_isempty(context->read_write_paths) ||
2001             !strv_isempty(context->read_only_paths) ||
2002             !strv_isempty(context->inaccessible_paths) ||
2003             !strv_isempty(context->exec_paths) ||
2004             !strv_isempty(context->no_exec_paths))
2005                 return true;
2006
2007         if (context->n_bind_mounts > 0)
2008                 return true;
2009
2010         if (context->n_temporary_filesystems > 0)
2011                 return true;
2012
2013         if (context->n_mount_images > 0)
2014                 return true;
2015
2016         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2017                 return true;
2018
2019         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2020                 return true;
2021
2022         if (context->private_devices ||
2023             context->private_mounts ||
2024             context->protect_system != PROTECT_SYSTEM_NO ||
2025             context->protect_home != PROTECT_HOME_NO ||
2026             context->protect_kernel_tunables ||
2027             context->protect_kernel_modules ||
2028             context->protect_kernel_logs ||
2029             context->protect_control_groups ||
2030             context->protect_proc != PROTECT_PROC_DEFAULT ||
2031             context->proc_subset != PROC_SUBSET_ALL)
2032                 return true;
2033
2034         if (context->root_directory) {
2035                 if (exec_context_get_effective_mount_apivfs(context))
2036                         return true;
2037
2038                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2039                         if (params && !params->prefix[t])
2040                                 continue;
2041
2042                         if (!strv_isempty(context->directories[t].paths))
2043                                 return true;
2044                 }
2045         }
2046
2047         if (context->dynamic_user &&
2048             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
2049              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2050              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2051                 return true;
2052
2053         if (context->log_namespace)
2054                 return true;
2055
2056         return false;
2057 }
2058
/* Moves the calling process into a new user namespace and has a short-lived helper child write the
 * UID/GID maps for it.
 *
 * ouid/ogid are the original IDs (from before any user or group change), uid/gid the selected IDs for the
 * service. Returns 0 on success and a negative errno-style value on failure; -EIO is returned if the helper
 * child reports something unexpected or does not exit cleanly. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only ever writes to the error pipe, drop the read end */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success is signalled by exiting without having written anything to the error pipe */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative errno to the parent before exiting */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below returns 0 once the child has
         * exited without reporting an error */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* Mark the child as handled, so that the sigkill_waitp cleanup leaves it alone */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2218
2219 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2220         if (!context->dynamic_user)
2221                 return false;
2222
2223         if (type == EXEC_DIRECTORY_CONFIGURATION)
2224                 return false;
2225
2226         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2227                 return false;
2228
2229         return true;
2230 }
2231
/* Creates the directories configured for the given directory type below params->prefix[type], migrating
 * them between their public and "private/" location where needed, and then adjusts access mode and
 * ownership.
 *
 * Returns 0 on success, also if no prefix is set for this type. On failure a negative errno-style value is
 * returned and *exit_status is set to the exit status that belongs to the directory type. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Exit status to report per directory type, in case setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix for this type means there is nothing to set up */
        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories, fall back to ID 0 for any ID that is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public location of the directory, pp its location below "private/". The latter
                 * is only set if the private directory logic applies. */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (exec_directory_is_private(context, type)) {
                        _cleanup_free_ char *private_root = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        private_root = path_join(params->prefix[type], "private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = path_join(private_root, *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", *rt);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, to make room for the directory itself */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                /* An already existing directory is fine, anything else is fatal */
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), *rt,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for this directory */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist.*/
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Tell the caller which exit status belongs to the directory type we failed on */
        *exit_status = exit_status_table[type];
        return r;
}
2456
2457 static int write_credential(
2458                 int dfd,
2459                 const char *id,
2460                 const void *data,
2461                 size_t size,
2462                 uid_t uid,
2463                 bool ownership_ok) {
2464
2465         _cleanup_(unlink_and_freep) char *tmp = NULL;
2466         _cleanup_close_ int fd = -1;
2467         int r;
2468
2469         r = tempfn_random_child("", "cred", &tmp);
2470         if (r < 0)
2471                 return r;
2472
2473         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2474         if (fd < 0) {
2475                 tmp = mfree(tmp);
2476                 return -errno;
2477         }
2478
2479         r = loop_write(fd, data, size, /* do_pool = */ false);
2480         if (r < 0)
2481                 return r;
2482
2483         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2484                 return -errno;
2485
2486         if (uid_is_valid(uid) && uid != getuid()) {
2487                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2488                 if (r < 0) {
2489                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2490                                 return r;
2491
2492                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2493                                             * to express: that the user gets read access and nothing
2494                                             * else. But if the backing fs can't support that (e.g. ramfs)
2495                                             * then we can use file ownership instead. But that's only safe if
2496                                             * we can then re-mount the whole thing read-only, so that the
2497                                             * user can no longer chmod() the file to gain write access. */
2498                                 return r;
2499
2500                         if (fchown(fd, uid, (gid_t) -1) < 0)
2501                                 return -errno;
2502                 }
2503         }
2504
2505         if (renameat(dfd, tmp, dfd, id) < 0)
2506                 return -errno;
2507
2508         tmp = mfree(tmp);
2509         return 0;
2510 }
2511
2512 #define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */
2513
2514 static int acquire_credentials(
2515                 const ExecContext *context,
2516                 const ExecParameters *params,
2517                 const char *unit,
2518                 const char *p,
2519                 uid_t uid,
2520                 bool ownership_ok) {
2521
2522         uint64_t left = CREDENTIALS_BYTES_MAX;
2523         _cleanup_close_ int dfd = -1;
2524         ExecSetCredential *sc;
2525         char **id, **fn;
2526         int r;
2527
2528         assert(context);
2529         assert(p);
2530
2531         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2532         if (dfd < 0)
2533                 return -errno;
2534
2535         /* First we use the literally specified credentials. Note that they might be overridden again below,
2536          * and thus act as a "default" if the same credential is specified multiple times */
2537         HASHMAP_FOREACH(sc, context->set_credentials) {
2538                 size_t add;
2539
2540                 add = strlen(sc->id) + sc->size;
2541                 if (add > left)
2542                         return -E2BIG;
2543
2544                 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2545                 if (r < 0)
2546                         return r;
2547
2548                 left -= add;
2549         }
2550
2551         /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2552         STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2553                 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2554                 _cleanup_(erase_and_freep) char *data = NULL;
2555                 _cleanup_free_ char *j = NULL, *bindname = NULL;
2556                 const char *source;
2557                 size_t size, add;
2558
2559                 if (path_is_absolute(*fn)) {
2560                         /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2561                         source = *fn;
2562                         flags |= READ_FULL_FILE_CONNECT_SOCKET;
2563
2564                         /* Pass some minimal info about the unit and the credential name we are looking to acquire
2565                          * via the source socket address in case we read off an AF_UNIX socket. */
2566                         if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2567                                 return -ENOMEM;
2568
2569                 } else if (params->received_credentials) {
2570                         /* If this is a relative path, take it relative to the credentials we received
2571                          * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2572                          * on a credential store, i.e. this is guaranteed to be regular files. */
2573                         j = path_join(params->received_credentials, *fn);
2574                         if (!j)
2575                                 return -ENOMEM;
2576
2577                         source = j;
2578                 } else
2579                         source = NULL;
2580
2581
2582                 if (source)
2583                         r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
2584                 else
2585                         r = -ENOENT;
2586                 if (r == -ENOENT &&
2587                     faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) /* If the source file doesn't exist, but we already acquired the key otherwise, then don't fail */
2588                         continue;
2589                 if (r < 0)
2590                         return r;
2591
2592                 add = strlen(*id) + size;
2593                 if (add > left)
2594                         return -E2BIG;
2595
2596                 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2597                 if (r < 0)
2598                         return r;
2599
2600                 left -= add;
2601         }
2602
2603         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2604                 return -errno;
2605
2606         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2607          * accessible */
2608
2609         if (uid_is_valid(uid) && uid != getuid()) {
2610                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2611                 if (r < 0) {
2612                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2613                                 return r;
2614
2615                         if (!ownership_ok)
2616                                 return r;
2617
2618                         if (fchown(dfd, uid, (gid_t) -1) < 0)
2619                                 return -errno;
2620                 }
2621         }
2622
2623         return 0;
2624 }
2625
2626 static int setup_credentials_internal(
2627                 const ExecContext *context,
2628                 const ExecParameters *params,
2629                 const char *unit,
2630                 const char *final,        /* This is where the credential store shall eventually end up at */
2631                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2632                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2633                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2634                 uid_t uid) {
2635
2636         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2637                                    * if we mounted something; false if we definitely can't mount anything */
2638         bool final_mounted;
2639         const char *where;
2640
2641         assert(context);
2642         assert(final);
2643         assert(workspace);
2644
2645         if (reuse_workspace) {
2646                 r = path_is_mount_point(workspace, NULL, 0);
2647                 if (r < 0)
2648                         return r;
2649                 if (r > 0)
2650                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2651                 else
2652                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2653         } else
2654                 workspace_mounted = -1; /* ditto */
2655
2656         r = path_is_mount_point(final, NULL, 0);
2657         if (r < 0)
2658                 return r;
2659         if (r > 0) {
2660                 /* If the final place already has something mounted, we use that. If the workspace also has
2661                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2662                  * different). */
2663                 final_mounted = true;
2664
2665                 if (workspace_mounted < 0) {
2666                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2667                          * the final version to the workspace, and make it writable, so that we can make
2668                          * changes */
2669
2670                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2671                         if (r < 0)
2672                                 return r;
2673
2674                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2675                         if (r < 0)
2676                                 return r;
2677
2678                         workspace_mounted = true;
2679                 }
2680         } else
2681                 final_mounted = false;
2682
2683         if (workspace_mounted < 0) {
2684                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2685                 for (int try = 0;; try++) {
2686
2687                         if (try == 0) {
2688                                 /* Try "ramfs" first, since it's not swap backed */
2689                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2690                                 if (r >= 0) {
2691                                         workspace_mounted = true;
2692                                         break;
2693                                 }
2694
2695                         } else if (try == 1) {
2696                                 _cleanup_free_ char *opts = NULL;
2697
2698                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2699                                         return -ENOMEM;
2700
2701                                 /* Fall back to "tmpfs" otherwise */
2702                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2703                                 if (r >= 0) {
2704                                         workspace_mounted = true;
2705                                         break;
2706                                 }
2707
2708                         } else {
2709                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2710                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2711                                 if (r < 0) {
2712                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2713                                                 return r;
2714
2715                                         if (must_mount) /* If we it's not OK to use the plain directory
2716                                                          * fallback, propagate all errors too */
2717                                                 return r;
2718
2719                                         /* If we lack privileges to bind mount stuff, then let's gracefully
2720                                          * proceed for compat with container envs, and just use the final dir
2721                                          * as is. */
2722
2723                                         workspace_mounted = false;
2724                                         break;
2725                                 }
2726
2727                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2728                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2729                                 if (r < 0)
2730                                         return r;
2731
2732                                 workspace_mounted = true;
2733                                 break;
2734                         }
2735                 }
2736         }
2737
2738         assert(!must_mount || workspace_mounted > 0);
2739         where = workspace_mounted ? workspace : final;
2740
2741         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2742         if (r < 0)
2743                 return r;
2744
2745         if (workspace_mounted) {
2746                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2747                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2748                 if (r < 0)
2749                         return r;
2750
2751                 /* And mount it to the final place, read-only */
2752                 if (final_mounted)
2753                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2754                 else
2755                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2756                 if (r < 0)
2757                         return r;
2758         } else {
2759                 _cleanup_free_ char *parent = NULL;
2760
2761                 /* If we do not have our own mount put used the plain directory fallback, then we need to
2762                  * open access to the top-level credential directory and the per-service directory now */
2763
2764                 parent = dirname_malloc(final);
2765                 if (!parent)
2766                         return -ENOMEM;
2767                 if (chmod(parent, 0755) < 0)
2768                         return -errno;
2769         }
2770
2771         return 0;
2772 }
2773
2774 static int setup_credentials(
2775                 const ExecContext *context,
2776                 const ExecParameters *params,
2777                 const char *unit,
2778                 uid_t uid) {
2779
2780         _cleanup_free_ char *p = NULL, *q = NULL;
2781         const char *i;
2782         int r;
2783
2784         assert(context);
2785         assert(params);
2786
2787         if (!exec_context_has_credentials(context))
2788                 return 0;
2789
2790         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2791                 return -EINVAL;
2792
2793         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2794          * and the subdir we mount over with a read-only file system readable by the service's user */
2795         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2796         if (!q)
2797                 return -ENOMEM;
2798
2799         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2800         if (r < 0 && r != -EEXIST)
2801                 return r;
2802
2803         p = path_join(q, unit);
2804         if (!p)
2805                 return -ENOMEM;
2806
2807         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2808         if (r < 0 && r != -EEXIST)
2809                 return r;
2810
2811         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2812         if (r < 0) {
2813                 _cleanup_free_ char *t = NULL, *u = NULL;
2814
2815                 /* If this is not a privilege or support issue then propagate the error */
2816                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2817                         return r;
2818
2819                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2820                  * it into place, so that users can't access half-initialized credential stores. */
2821                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2822                 if (!t)
2823                         return -ENOMEM;
2824
2825                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2826                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2827                  * after it is fully set up */
2828                 u = path_join(t, unit);
2829                 if (!u)
2830                         return -ENOMEM;
2831
2832                 FOREACH_STRING(i, t, u) {
2833                         r = mkdir_label(i, 0700);
2834                         if (r < 0 && r != -EEXIST)
2835                                 return r;
2836                 }
2837
2838                 r = setup_credentials_internal(
2839                                 context,
2840                                 params,
2841                                 unit,
2842                                 p,       /* final mount point */
2843                                 u,       /* temporary workspace to overmount */
2844                                 true,    /* reuse the workspace if it is already a mount */
2845                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
2846                                 uid);
2847
2848                 (void) rmdir(u); /* remove the workspace again if we can. */
2849
2850                 if (r < 0)
2851                         return r;
2852
2853         } else if (r == 0) {
2854
2855                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2856                  * we can use the same directory for all cases, after turning off propagation. Question
2857                  * though is: where do we turn off propagation exactly, and where do we place the workspace
2858                  * directory? We need some place that is guaranteed to be a mount point in the host, and
2859                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2860                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
2861                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2862                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2863                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2864                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2865                  * propagation on the former, and then overmount the latter.
2866                  *
2867                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2868                  * for this purpose, but there are few other candidates that work equally well for us, and
2869                  * given that the we do this in a privately namespaced short-lived single-threaded process
2870                  * that no one else sees this should be OK to do.*/
2871
2872                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2873                 if (r < 0)
2874                         goto child_fail;
2875
2876                 r = setup_credentials_internal(
2877                                 context,
2878                                 params,
2879                                 unit,
2880                                 p,           /* final mount point */
2881                                 "/dev/shm",  /* temporary workspace to overmount */
2882                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2883                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
2884                                 uid);
2885                 if (r < 0)
2886                         goto child_fail;
2887
2888                 _exit(EXIT_SUCCESS);
2889
2890         child_fail:
2891                 _exit(EXIT_FAILURE);
2892         }
2893
2894         return 0;
2895 }
2896
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed.
 *
 * If the context configures a process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec label read from 'executable_fd' is
 * applied, with SMACK_DEFAULT_PROCESS_LABEL as fallback if none was read.
 *
 * Returns 0 on success, a negative errno-style error code otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label = NULL;

                /* A missing label or missing xattr support is not an error, we use the default then */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2928
2929 static int compile_bind_mounts(
2930                 const ExecContext *context,
2931                 const ExecParameters *params,
2932                 BindMount **ret_bind_mounts,
2933                 size_t *ret_n_bind_mounts,
2934                 char ***ret_empty_directories) {
2935
2936         _cleanup_strv_free_ char **empty_directories = NULL;
2937         BindMount *bind_mounts;
2938         size_t n, h = 0;
2939         int r;
2940
2941         assert(context);
2942         assert(params);
2943         assert(ret_bind_mounts);
2944         assert(ret_n_bind_mounts);
2945         assert(ret_empty_directories);
2946
2947         n = context->n_bind_mounts;
2948         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2949                 if (!params->prefix[t])
2950                         continue;
2951
2952                 n += strv_length(context->directories[t].paths);
2953         }
2954
2955         if (n <= 0) {
2956                 *ret_bind_mounts = NULL;
2957                 *ret_n_bind_mounts = 0;
2958                 *ret_empty_directories = NULL;
2959                 return 0;
2960         }
2961
2962         bind_mounts = new(BindMount, n);
2963         if (!bind_mounts)
2964                 return -ENOMEM;
2965
2966         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2967                 BindMount *item = context->bind_mounts + i;
2968                 char *s, *d;
2969
2970                 s = strdup(item->source);
2971                 if (!s) {
2972                         r = -ENOMEM;
2973                         goto finish;
2974                 }
2975
2976                 d = strdup(item->destination);
2977                 if (!d) {
2978                         free(s);
2979                         r = -ENOMEM;
2980                         goto finish;
2981                 }
2982
2983                 bind_mounts[h++] = (BindMount) {
2984                         .source = s,
2985                         .destination = d,
2986                         .read_only = item->read_only,
2987                         .recursive = item->recursive,
2988                         .ignore_enoent = item->ignore_enoent,
2989                 };
2990         }
2991
2992         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2993                 char **suffix;
2994
2995                 if (!params->prefix[t])
2996                         continue;
2997
2998                 if (strv_isempty(context->directories[t].paths))
2999                         continue;
3000
3001                 if (exec_directory_is_private(context, t) &&
3002                     !exec_context_with_rootfs(context)) {
3003                         char *private_root;
3004
3005                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3006                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3007                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3008
3009                         private_root = path_join(params->prefix[t], "private");
3010                         if (!private_root) {
3011                                 r = -ENOMEM;
3012                                 goto finish;
3013                         }
3014
3015                         r = strv_consume(&empty_directories, private_root);
3016                         if (r < 0)
3017                                 goto finish;
3018                 }
3019
3020                 STRV_FOREACH(suffix, context->directories[t].paths) {
3021                         char *s, *d;
3022
3023                         if (exec_directory_is_private(context, t))
3024                                 s = path_join(params->prefix[t], "private", *suffix);
3025                         else
3026                                 s = path_join(params->prefix[t], *suffix);
3027                         if (!s) {
3028                                 r = -ENOMEM;
3029                                 goto finish;
3030                         }
3031
3032                         if (exec_directory_is_private(context, t) &&
3033                             exec_context_with_rootfs(context))
3034                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3035                                  * directory is not created on the root directory. So, let's bind-mount the directory
3036                                  * on the 'non-private' place. */
3037                                 d = path_join(params->prefix[t], *suffix);
3038                         else
3039                                 d = strdup(s);
3040                         if (!d) {
3041                                 free(s);
3042                                 r = -ENOMEM;
3043                                 goto finish;
3044                         }
3045
3046                         bind_mounts[h++] = (BindMount) {
3047                                 .source = s,
3048                                 .destination = d,
3049                                 .read_only = false,
3050                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3051                                 .recursive = true,
3052                                 .ignore_enoent = false,
3053                         };
3054                 }
3055         }
3056
3057         assert(h == n);
3058
3059         *ret_bind_mounts = bind_mounts;
3060         *ret_n_bind_mounts = n;
3061         *ret_empty_directories = TAKE_PTR(empty_directories);
3062
3063         return (int) n;
3064
3065 finish:
3066         bind_mount_free_many(bind_mounts, h);
3067         return r;
3068 }
3069
3070 static bool insist_on_sandboxing(
3071                 const ExecContext *context,
3072                 const char *root_dir,
3073                 const char *root_image,
3074                 const BindMount *bind_mounts,
3075                 size_t n_bind_mounts) {
3076
3077         assert(context);
3078         assert(n_bind_mounts == 0 || bind_mounts);
3079
3080         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3081          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3082          * rearrange stuff in a way we cannot ignore gracefully. */
3083
3084         if (context->n_temporary_filesystems > 0)
3085                 return true;
3086
3087         if (root_dir || root_image)
3088                 return true;
3089
3090         if (context->n_mount_images > 0)
3091                 return true;
3092
3093         if (context->dynamic_user)
3094                 return true;
3095
3096         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3097          * essential. */
3098         for (size_t i = 0; i < n_bind_mounts; i++)
3099                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3100                         return true;
3101
3102         if (context->log_namespace)
3103                 return true;
3104
3105         return false;
3106 }
3107
3108 static int apply_mount_namespace(
3109                 const Unit *u,
3110                 ExecCommandFlags command_flags,
3111                 const ExecContext *context,
3112                 const ExecParameters *params,
3113                 const ExecRuntime *runtime,
3114                 char **error_path) {
3115
3116         _cleanup_strv_free_ char **empty_directories = NULL;
3117         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3118         const char *root_dir = NULL, *root_image = NULL;
3119         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3120         NamespaceInfo ns_info;
3121         bool needs_sandboxing;
3122         BindMount *bind_mounts = NULL;
3123         size_t n_bind_mounts = 0;
3124         int r;
3125
3126         assert(context);
3127
3128         if (params->flags & EXEC_APPLY_CHROOT) {
3129                 root_image = context->root_image;
3130
3131                 if (!root_image)
3132                         root_dir = context->root_directory;
3133         }
3134
3135         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3136         if (r < 0)
3137                 return r;
3138
3139         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3140         if (needs_sandboxing) {
3141                 /* The runtime struct only contains the parent of the private /tmp,
3142                  * which is non-accessible to world users. Inside of it there's a /tmp
3143                  * that is sticky, and that's the one we want to use here.
3144                  * This does not apply when we are using /run/systemd/empty as fallback. */
3145
3146                 if (context->private_tmp && runtime) {
3147                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3148                                 tmp_dir = runtime->tmp_dir;
3149                         else if (runtime->tmp_dir)
3150                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3151
3152                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3153                                 var_tmp_dir = runtime->var_tmp_dir;
3154                         else if (runtime->var_tmp_dir)
3155                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3156                 }
3157
3158                 ns_info = (NamespaceInfo) {
3159                         .ignore_protect_paths = false,
3160                         .private_dev = context->private_devices,
3161                         .protect_control_groups = context->protect_control_groups,
3162                         .protect_kernel_tunables = context->protect_kernel_tunables,
3163                         .protect_kernel_modules = context->protect_kernel_modules,
3164                         .protect_kernel_logs = context->protect_kernel_logs,
3165                         .protect_hostname = context->protect_hostname,
3166                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3167                         .private_mounts = context->private_mounts,
3168                         .protect_home = context->protect_home,
3169                         .protect_system = context->protect_system,
3170                         .protect_proc = context->protect_proc,
3171                         .proc_subset = context->proc_subset,
3172                 };
3173         } else if (!context->dynamic_user && root_dir)
3174                 /*
3175                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3176                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3177                  * fail if we are enable to apply the sandbox inside the mount namespace.
3178                  */
3179                 ns_info = (NamespaceInfo) {
3180                         .ignore_protect_paths = true,
3181                 };
3182         else
3183                 ns_info = (NamespaceInfo) {};
3184
3185         if (context->mount_flags == MS_SHARED)
3186                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3187
3188         if (exec_context_has_credentials(context) &&
3189             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3190             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3191                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3192                 if (!creds_path) {
3193                         r = -ENOMEM;
3194                         goto finalize;
3195                 }
3196         }
3197
3198         if (MANAGER_IS_SYSTEM(u->manager)) {
3199                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3200                 if (!propagate_dir)
3201                         return -ENOMEM;
3202                 incoming_dir = strdup("/run/systemd/incoming");
3203                 if (!incoming_dir)
3204                         return -ENOMEM;
3205         }
3206
3207         r = setup_namespace(root_dir, root_image, context->root_image_options,
3208                             &ns_info, context->read_write_paths,
3209                             needs_sandboxing ? context->read_only_paths : NULL,
3210                             needs_sandboxing ? context->inaccessible_paths : NULL,
3211                             needs_sandboxing ? context->exec_paths : NULL,
3212                             needs_sandboxing ? context->no_exec_paths : NULL,
3213                             empty_directories,
3214                             bind_mounts,
3215                             n_bind_mounts,
3216                             context->temporary_filesystems,
3217                             context->n_temporary_filesystems,
3218                             context->mount_images,
3219                             context->n_mount_images,
3220                             tmp_dir,
3221                             var_tmp_dir,
3222                             creds_path,
3223                             context->log_namespace,
3224                             context->mount_flags,
3225                             context->root_hash, context->root_hash_size, context->root_hash_path,
3226                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3227                             context->root_verity,
3228                             propagate_dir,
3229                             incoming_dir,
3230                             root_dir || root_image ? params->notify_socket : NULL,
3231                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
3232                             error_path);
3233
3234         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3235          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3236          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3237          * completely different execution environment. */
3238         if (r == -ENOANO) {
3239                 if (insist_on_sandboxing(
3240                                     context,
3241                                     root_dir, root_image,
3242                                     bind_mounts,
3243                                     n_bind_mounts)) {
3244                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3245                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3246                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3247
3248                         r = -EOPNOTSUPP;
3249                 } else {
3250                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3251                         r = 0;
3252                 }
3253         }
3254
3255 finalize:
3256         bind_mount_free_many(bind_mounts, n_bind_mounts);
3257         return r;
3258 }
3259
3260 static int apply_working_directory(
3261                 const ExecContext *context,
3262                 const ExecParameters *params,
3263                 const char *home,
3264                 int *exit_status) {
3265
3266         const char *d, *wd;
3267
3268         assert(context);
3269         assert(exit_status);
3270
3271         if (context->working_directory_home) {
3272
3273                 if (!home) {
3274                         *exit_status = EXIT_CHDIR;
3275                         return -ENXIO;
3276                 }
3277
3278                 wd = home;
3279
3280         } else
3281                 wd = empty_to_root(context->working_directory);
3282
3283         if (params->flags & EXEC_APPLY_CHROOT)
3284                 d = wd;
3285         else
3286                 d = prefix_roota(context->root_directory, wd);
3287
3288         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3289                 *exit_status = EXIT_CHDIR;
3290                 return -errno;
3291         }
3292
3293         return 0;
3294 }
3295
3296 static int apply_root_directory(
3297                 const ExecContext *context,
3298                 const ExecParameters *params,
3299                 const bool needs_mount_ns,
3300                 int *exit_status) {
3301
3302         assert(context);
3303         assert(exit_status);
3304
3305         if (params->flags & EXEC_APPLY_CHROOT)
3306                 if (!needs_mount_ns && context->root_directory)
3307                         if (chroot(context->root_directory) < 0) {
3308                                 *exit_status = EXIT_CHROOT;
3309                                 return -errno;
3310                         }
3311
3312         return 0;
3313 }
3314
3315 static int setup_keyring(
3316                 const Unit *u,
3317                 const ExecContext *context,
3318                 const ExecParameters *p,
3319                 uid_t uid, gid_t gid) {
3320
3321         key_serial_t keyring;
3322         int r = 0;
3323         uid_t saved_uid;
3324         gid_t saved_gid;
3325
3326         assert(u);
3327         assert(context);
3328         assert(p);
3329
3330         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3331          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3332          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3333          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3334          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3335          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3336
3337         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3338                 return 0;
3339
3340         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3341          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3342          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3343          * & group is just as nasty as acquiring a reference to the user keyring. */
3344
3345         saved_uid = getuid();
3346         saved_gid = getgid();
3347
3348         if (gid_is_valid(gid) && gid != saved_gid) {
3349                 if (setregid(gid, -1) < 0)
3350                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3351         }
3352
3353         if (uid_is_valid(uid) && uid != saved_uid) {
3354                 if (setreuid(uid, -1) < 0) {
3355                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3356                         goto out;
3357                 }
3358         }
3359
3360         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3361         if (keyring == -1) {
3362                 if (errno == ENOSYS)
3363                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3364                 else if (ERRNO_IS_PRIVILEGE(errno))
3365                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3366                 else if (errno == EDQUOT)
3367                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3368                 else
3369                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3370
3371                 goto out;
3372         }
3373
3374         /* When requested link the user keyring into the session keyring. */
3375         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3376
3377                 if (keyctl(KEYCTL_LINK,
3378                            KEY_SPEC_USER_KEYRING,
3379                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3380                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3381                         goto out;
3382                 }
3383         }
3384
3385         /* Restore uid/gid back */
3386         if (uid_is_valid(uid) && uid != saved_uid) {
3387                 if (setreuid(saved_uid, -1) < 0) {
3388                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3389                         goto out;
3390                 }
3391         }
3392
3393         if (gid_is_valid(gid) && gid != saved_gid) {
3394                 if (setregid(saved_gid, -1) < 0)
3395                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3396         }
3397
3398         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3399         if (!sd_id128_is_null(u->invocation_id)) {
3400                 key_serial_t key;
3401
3402                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3403                 if (key == -1)
3404                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3405                 else {
3406                         if (keyctl(KEYCTL_SETPERM, key,
3407                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3408                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3409                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3410                 }
3411         }
3412
3413 out:
3414         /* Revert back uid & gid for the last time, and exit */
3415         /* no extra logging, as only the first already reported error matters */
3416         if (getuid() != saved_uid)
3417                 (void) setreuid(saved_uid, -1);
3418
3419         if (getgid() != saved_gid)
3420                 (void) setregid(saved_gid, -1);
3421
3422         return r;
3423 }
3424
3425 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
3426         assert(array);
3427         assert(n);
3428         assert(pair);
3429
3430         if (pair[0] >= 0)
3431                 array[(*n)++] = pair[0];
3432         if (pair[1] >= 0)
3433                 array[(*n)++] = pair[1];
3434 }
3435
3436 static int close_remaining_fds(
3437                 const ExecParameters *params,
3438                 const ExecRuntime *runtime,
3439                 const DynamicCreds *dcreds,
3440                 int user_lookup_fd,
3441                 int socket_fd,
3442                 const int *fds, size_t n_fds) {
3443
3444         size_t n_dont_close = 0;
3445         int dont_close[n_fds + 12];
3446
3447         assert(params);
3448
3449         if (params->stdin_fd >= 0)
3450                 dont_close[n_dont_close++] = params->stdin_fd;
3451         if (params->stdout_fd >= 0)
3452                 dont_close[n_dont_close++] = params->stdout_fd;
3453         if (params->stderr_fd >= 0)
3454                 dont_close[n_dont_close++] = params->stderr_fd;
3455
3456         if (socket_fd >= 0)
3457                 dont_close[n_dont_close++] = socket_fd;
3458         if (n_fds > 0) {
3459                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3460                 n_dont_close += n_fds;
3461         }
3462
3463         if (runtime)
3464                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3465
3466         if (dcreds) {
3467                 if (dcreds->user)
3468                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3469                 if (dcreds->group)
3470                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3471         }
3472
3473         if (user_lookup_fd >= 0)
3474                 dont_close[n_dont_close++] = user_lookup_fd;
3475
3476         return close_all_fds(dont_close, n_dont_close);
3477 }
3478
3479 static int send_user_lookup(
3480                 Unit *unit,
3481                 int user_lookup_fd,
3482                 uid_t uid,
3483                 gid_t gid) {
3484
3485         assert(unit);
3486
3487         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3488          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3489          * specified. */
3490
3491         if (user_lookup_fd < 0)
3492                 return 0;
3493
3494         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3495                 return 0;
3496
3497         if (writev(user_lookup_fd,
3498                (struct iovec[]) {
3499                            IOVEC_INIT(&uid, sizeof(uid)),
3500                            IOVEC_INIT(&gid, sizeof(gid)),
3501                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3502                 return -errno;
3503
3504         return 0;
3505 }
3506
3507 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3508         int r;
3509
3510         assert(c);
3511         assert(home);
3512         assert(buf);
3513
3514         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3515
3516         if (*home)
3517                 return 0;
3518
3519         if (!c->working_directory_home)
3520                 return 0;
3521
3522         r = get_home_dir(buf);
3523         if (r < 0)
3524                 return r;
3525
3526         *home = *buf;
3527         return 1;
3528 }
3529
3530 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3531         _cleanup_strv_free_ char ** list = NULL;
3532         int r;
3533
3534         assert(c);
3535         assert(p);
3536         assert(ret);
3537
3538         assert(c->dynamic_user);
3539
3540         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3541          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3542          * directories. */
3543
3544         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3545                 char **i;
3546
3547                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3548                         continue;
3549
3550                 if (!p->prefix[t])
3551                         continue;
3552
3553                 STRV_FOREACH(i, c->directories[t].paths) {
3554                         char *e;
3555
3556                         if (exec_directory_is_private(c, t))
3557                                 e = path_join(p->prefix[t], "private", *i);
3558                         else
3559                                 e = path_join(p->prefix[t], *i);
3560                         if (!e)
3561                                 return -ENOMEM;
3562
3563                         r = strv_consume(&list, e);
3564                         if (r < 0)
3565                                 return r;
3566                 }
3567         }
3568
3569         *ret = TAKE_PTR(list);
3570
3571         return 0;
3572 }
3573
3574 static char *exec_command_line(char **argv);
3575
3576 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3577         bool using_subcgroup;
3578         char *p;
3579
3580         assert(params);
3581         assert(ret);
3582
3583         if (!params->cgroup_path)
3584                 return -EINVAL;
3585
3586         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3587          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3588          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3589          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3590          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3591          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3592          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3593          * flag, which is only passed for the former statements, not for the latter. */
3594
3595         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3596         if (using_subcgroup)
3597                 p = path_join(params->cgroup_path, ".control");
3598         else
3599                 p = strdup(params->cgroup_path);
3600         if (!p)
3601                 return -ENOMEM;
3602
3603         *ret = p;
3604         return using_subcgroup;
3605 }
3606
3607 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3608         _cleanup_(cpu_set_reset) CPUSet s = {};
3609         int r;
3610
3611         assert(c);
3612         assert(ret);
3613
3614         if (!c->numa_policy.nodes.set) {
3615                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3616                 return 0;
3617         }
3618
3619         r = numa_to_cpu_set(&c->numa_policy, &s);
3620         if (r < 0)
3621                 return r;
3622
3623         cpu_set_reset(ret);
3624
3625         return cpu_set_add_all(ret, &s);
3626 }
3627
3628 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3629         assert(c);
3630
3631         return c->cpu_affinity_from_numa;
3632 }
3633
3634 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3635         int r;
3636
3637         assert(fds);
3638         assert(n_fds);
3639         assert(*n_fds < fds_size);
3640         assert(ret_fd);
3641
3642         if (fd < 0) {
3643                 *ret_fd = -1;
3644                 return 0;
3645         }
3646
3647         if (fd < 3 + (int) *n_fds) {
3648                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3649                  * the fds we pass to the process (or which are closed only during execve). */
3650
3651                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3652                 if (r < 0)
3653                         return -errno;
3654
3655                 CLOSE_AND_REPLACE(fd, r);
3656         }
3657
3658         *ret_fd = fds[*n_fds] = fd;
3659         (*n_fds) ++;
3660         return 1;
3661 }
3662
3663 static int exec_child(
3664                 Unit *unit,
3665                 const ExecCommand *command,
3666                 const ExecContext *context,
3667                 const ExecParameters *params,
3668                 ExecRuntime *runtime,
3669                 DynamicCreds *dcreds,
3670                 int socket_fd,
3671                 const int named_iofds[static 3],
3672                 int *fds,
3673                 size_t n_socket_fds,
3674                 size_t n_storage_fds,
3675                 char **files_env,
3676                 int user_lookup_fd,
3677                 int *exit_status) {
3678
3679         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3680         int r, ngids = 0, exec_fd;
3681         _cleanup_free_ gid_t *supplementary_gids = NULL;
3682         const char *username = NULL, *groupname = NULL;
3683         _cleanup_free_ char *home_buffer = NULL;
3684         const char *home = NULL, *shell = NULL;
3685         char **final_argv = NULL;
3686         dev_t journal_stream_dev = 0;
3687         ino_t journal_stream_ino = 0;
3688         bool userns_set_up = false;
3689         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3690                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3691                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3692                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3693 #if HAVE_SELINUX
3694         _cleanup_free_ char *mac_selinux_context_net = NULL;
3695         bool use_selinux = false;
3696 #endif
3697 #if ENABLE_SMACK
3698         bool use_smack = false;
3699 #endif
3700 #if HAVE_APPARMOR
3701         bool use_apparmor = false;
3702 #endif
3703         uid_t saved_uid = getuid();
3704         gid_t saved_gid = getgid();
3705         uid_t uid = UID_INVALID;
3706         gid_t gid = GID_INVALID;
3707         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3708                n_keep_fds; /* total number of fds not to close */
3709         int secure_bits;
3710         _cleanup_free_ gid_t *gids_after_pam = NULL;
3711         int ngids_after_pam = 0;
3712
3713         assert(unit);
3714         assert(command);
3715         assert(context);
3716         assert(params);
3717         assert(exit_status);
3718
3719         rename_process_from_path(command->path);
3720
3721         /* We reset exactly these signals, since they are the
3722          * only ones we set to SIG_IGN in the main daemon. All
3723          * others we leave untouched because we set them to
3724          * SIG_DFL or a valid handler initially, both of which
3725          * will be demoted to SIG_DFL. */
3726         (void) default_signals(SIGNALS_CRASH_HANDLER,
3727                                SIGNALS_IGNORE, -1);
3728
3729         if (context->ignore_sigpipe)
3730                 (void) ignore_signals(SIGPIPE, -1);
3731
3732         r = reset_signal_mask();
3733         if (r < 0) {
3734                 *exit_status = EXIT_SIGNAL_MASK;
3735                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3736         }
3737
3738         if (params->idle_pipe)
3739                 do_idle_pipe_dance(params->idle_pipe);
3740
3741         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3742          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3743          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3744          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3745
3746         log_forget_fds();
3747         log_set_open_when_needed(true);
3748
3749         /* In case anything used libc syslog(), close this here, too */
3750         closelog();
3751
3752         int keep_fds[n_fds + 2];
3753         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3754         n_keep_fds = n_fds;
3755
3756         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3757         if (r < 0) {
3758                 *exit_status = EXIT_FDS;
3759                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3760         }
3761
3762         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3763         if (r < 0) {
3764                 *exit_status = EXIT_FDS;
3765                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3766         }
3767
3768         if (!context->same_pgrp &&
3769             setsid() < 0) {
3770                 *exit_status = EXIT_SETSID;
3771                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3772         }
3773
3774         exec_context_tty_reset(context, params);
3775
3776         if (unit_shall_confirm_spawn(unit)) {
3777                 const char *vc = params->confirm_spawn;
3778                 _cleanup_free_ char *cmdline = NULL;
3779
3780                 cmdline = exec_command_line(command->argv);
3781                 if (!cmdline) {
3782                         *exit_status = EXIT_MEMORY;
3783                         return log_oom();
3784                 }
3785
3786                 r = ask_for_confirmation(vc, unit, cmdline);
3787                 if (r != CONFIRM_EXECUTE) {
3788                         if (r == CONFIRM_PRETEND_SUCCESS) {
3789                                 *exit_status = EXIT_SUCCESS;
3790                                 return 0;
3791                         }
3792                         *exit_status = EXIT_CONFIRM;
3793                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3794                                                     "Execution cancelled by the user");
3795                 }
3796         }
3797
3798         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3799          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3800          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3801          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3802          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3803         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3804             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3805                 *exit_status = EXIT_MEMORY;
3806                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3807         }
3808
3809         if (context->dynamic_user && dcreds) {
3810                 _cleanup_strv_free_ char **suggested_paths = NULL;
3811
3812                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3813                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3814                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3815                         *exit_status = EXIT_USER;
3816                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3817                 }
3818
3819                 r = compile_suggested_paths(context, params, &suggested_paths);
3820                 if (r < 0) {
3821                         *exit_status = EXIT_MEMORY;
3822                         return log_oom();
3823                 }
3824
3825                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3826                 if (r < 0) {
3827                         *exit_status = EXIT_USER;
3828                         if (r == -EILSEQ)
3829                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3830                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
3831                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3832                 }
3833
3834                 if (!uid_is_valid(uid)) {
3835                         *exit_status = EXIT_USER;
3836                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
3837                 }
3838
3839                 if (!gid_is_valid(gid)) {
3840                         *exit_status = EXIT_USER;
3841                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
3842                 }
3843
3844                 if (dcreds->user)
3845                         username = dcreds->user->name;
3846
3847         } else {
3848                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3849                 if (r < 0) {
3850                         *exit_status = EXIT_USER;
3851                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3852                 }
3853
3854                 r = get_fixed_group(context, &groupname, &gid);
3855                 if (r < 0) {
3856                         *exit_status = EXIT_GROUP;
3857                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3858                 }
3859         }
3860
3861         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3862         r = get_supplementary_groups(context, username, groupname, gid,
3863                                      &supplementary_gids, &ngids);
3864         if (r < 0) {
3865                 *exit_status = EXIT_GROUP;
3866                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3867         }
3868
3869         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3870         if (r < 0) {
3871                 *exit_status = EXIT_USER;
3872                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3873         }
3874
3875         user_lookup_fd = safe_close(user_lookup_fd);
3876
3877         r = acquire_home(context, uid, &home, &home_buffer);
3878         if (r < 0) {
3879                 *exit_status = EXIT_CHDIR;
3880                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3881         }
3882
3883         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3884          * must sure to drop O_NONBLOCK */
3885         if (socket_fd >= 0)
3886                 (void) fd_nonblock(socket_fd, false);
3887
3888         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3889          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3890         if (params->cgroup_path) {
3891                 _cleanup_free_ char *p = NULL;
3892
3893                 r = exec_parameters_get_cgroup_path(params, &p);
3894                 if (r < 0) {
3895                         *exit_status = EXIT_CGROUP;
3896                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3897                 }
3898
3899                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3900                 if (r < 0) {
3901                         *exit_status = EXIT_CGROUP;
3902                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3903                 }
3904         }
3905
3906         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3907                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3908                 if (r < 0) {
3909                         *exit_status = EXIT_NETWORK;
3910                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3911                 }
3912         }
3913
3914         r = setup_input(context, params, socket_fd, named_iofds);
3915         if (r < 0) {
3916                 *exit_status = EXIT_STDIN;
3917                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3918         }
3919
3920         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3921         if (r < 0) {
3922                 *exit_status = EXIT_STDOUT;
3923                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3924         }
3925
3926         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3927         if (r < 0) {
3928                 *exit_status = EXIT_STDERR;
3929                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3930         }
3931
3932         if (context->oom_score_adjust_set) {
3933                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3934                  * prohibit write access to this file, and we shouldn't trip up over that. */
3935                 r = set_oom_score_adjust(context->oom_score_adjust);
3936                 if (ERRNO_IS_PRIVILEGE(r))
3937                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3938                 else if (r < 0) {
3939                         *exit_status = EXIT_OOM_ADJUST;
3940                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3941                 }
3942         }
3943
3944         if (context->coredump_filter_set) {
3945                 r = set_coredump_filter(context->coredump_filter);
3946                 if (ERRNO_IS_PRIVILEGE(r))
3947                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3948                 else if (r < 0)
3949                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3950         }
3951
3952         if (context->nice_set) {
3953                 r = setpriority_closest(context->nice);
3954                 if (r < 0)
3955                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3956         }
3957
3958         if (context->cpu_sched_set) {
3959                 struct sched_param param = {
3960                         .sched_priority = context->cpu_sched_priority,
3961                 };
3962
3963                 r = sched_setscheduler(0,
3964                                        context->cpu_sched_policy |
3965                                        (context->cpu_sched_reset_on_fork ?
3966                                         SCHED_RESET_ON_FORK : 0),
3967                                        &param);
3968                 if (r < 0) {
3969                         *exit_status = EXIT_SETSCHEDULER;
3970                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3971                 }
3972         }
3973
3974         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3975                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3976                 const CPUSet *cpu_set;
3977
3978                 if (context->cpu_affinity_from_numa) {
3979                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3980                         if (r < 0) {
3981                                 *exit_status = EXIT_CPUAFFINITY;
3982                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3983                         }
3984
3985                         cpu_set = &converted_cpu_set;
3986                 } else
3987                         cpu_set = &context->cpu_set;
3988
3989                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3990                         *exit_status = EXIT_CPUAFFINITY;
3991                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3992                 }
3993         }
3994
3995         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3996                 r = apply_numa_policy(&context->numa_policy);
3997                 if (r == -EOPNOTSUPP)
3998                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3999                 else if (r < 0) {
4000                         *exit_status = EXIT_NUMA_POLICY;
4001                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4002                 }
4003         }
4004
4005         if (context->ioprio_set)
4006                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4007                         *exit_status = EXIT_IOPRIO;
4008                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4009                 }
4010
4011         if (context->timer_slack_nsec != NSEC_INFINITY)
4012                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4013                         *exit_status = EXIT_TIMERSLACK;
4014                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4015                 }
4016
4017         if (context->personality != PERSONALITY_INVALID) {
4018                 r = safe_personality(context->personality);
4019                 if (r < 0) {
4020                         *exit_status = EXIT_PERSONALITY;
4021                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4022                 }
4023         }
4024
4025         if (context->utmp_id)
4026                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4027                                       context->tty_path,
4028                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4029                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4030                                       USER_PROCESS,
4031                                       username);
4032
4033         if (uid_is_valid(uid)) {
4034                 r = chown_terminal(STDIN_FILENO, uid);
4035                 if (r < 0) {
4036                         *exit_status = EXIT_STDIN;
4037                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4038                 }
4039         }
4040
4041         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4042          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4043          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4044          * touch a single hierarchy too. */
4045         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4046                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4047                 if (r < 0) {
4048                         *exit_status = EXIT_CGROUP;
4049                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4050                 }
4051         }
4052
4053         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4054                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
4055                 if (r < 0)
4056                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4057         }
4058
4059         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4060                 r = setup_credentials(context, params, unit->id, uid);
4061                 if (r < 0) {
4062                         *exit_status = EXIT_CREDENTIALS;
4063                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4064                 }
4065         }
4066
4067         r = build_environment(
4068                         unit,
4069                         context,
4070                         params,
4071                         n_fds,
4072                         home,
4073                         username,
4074                         shell,
4075                         journal_stream_dev,
4076                         journal_stream_ino,
4077                         &our_env);
4078         if (r < 0) {
4079                 *exit_status = EXIT_MEMORY;
4080                 return log_oom();
4081         }
4082
4083         r = build_pass_environment(context, &pass_env);
4084         if (r < 0) {
4085                 *exit_status = EXIT_MEMORY;
4086                 return log_oom();
4087         }
4088
4089         accum_env = strv_env_merge(5,
4090                                    params->environment,
4091                                    our_env,
4092                                    pass_env,
4093                                    context->environment,
4094                                    files_env);
4095         if (!accum_env) {
4096                 *exit_status = EXIT_MEMORY;
4097                 return log_oom();
4098         }
4099         accum_env = strv_env_clean(accum_env);
4100
4101         (void) umask(context->umask);
4102
4103         r = setup_keyring(unit, context, params, uid, gid);
4104         if (r < 0) {
4105                 *exit_status = EXIT_KEYRING;
4106                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4107         }
4108
4109         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4110         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4111
4112         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4113         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4114
4115         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4116         if (needs_ambient_hack)
4117                 needs_setuid = false;
4118         else
4119                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4120
4121         if (needs_sandboxing) {
4122                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4123                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4124                  * impacting our own code paths. */
4125
4126 #if HAVE_SELINUX
4127                 use_selinux = mac_selinux_use();
4128 #endif
4129 #if ENABLE_SMACK
4130                 use_smack = mac_smack_use();
4131 #endif
4132 #if HAVE_APPARMOR
4133                 use_apparmor = mac_apparmor_use();
4134 #endif
4135         }
4136
4137         if (needs_sandboxing) {
4138                 int which_failed;
4139
4140                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4141                  * is set here. (See below.) */
4142
4143                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4144                 if (r < 0) {
4145                         *exit_status = EXIT_LIMITS;
4146                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4147                 }
4148         }
4149
4150         if (needs_setuid && context->pam_name && username) {
4151                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4152                  * wins here. (See above.) */
4153
4154                 /* All fds passed in the fds array will be closed in the pam child process. */
4155                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4156                 if (r < 0) {
4157                         *exit_status = EXIT_PAM;
4158                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4159                 }
4160
4161                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4162                 if (ngids_after_pam < 0) {
4163                         *exit_status = EXIT_MEMORY;
4164                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4165                 }
4166         }
4167
4168         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4169                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4170                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4171                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4172
4173                 userns_set_up = true;
4174                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4175                 if (r < 0) {
4176                         *exit_status = EXIT_USER;
4177                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4178                 }
4179         }
4180
4181         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4182
4183                 if (ns_type_supported(NAMESPACE_NET)) {
4184                         r = setup_netns(runtime->netns_storage_socket);
4185                         if (r == -EPERM)
4186                                 log_unit_warning_errno(unit, r,
4187                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4188                         else if (r < 0) {
4189                                 *exit_status = EXIT_NETWORK;
4190                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4191                         }
4192                 } else if (context->network_namespace_path) {
4193                         *exit_status = EXIT_NETWORK;
4194                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4195                                                     "NetworkNamespacePath= is not supported, refusing.");
4196                 } else
4197                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4198         }
4199
4200         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4201         if (needs_mount_namespace) {
4202                 _cleanup_free_ char *error_path = NULL;
4203
4204                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4205                 if (r < 0) {
4206                         *exit_status = EXIT_NAMESPACE;
4207                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4208                                                     error_path ? ": " : "", strempty(error_path));
4209                 }
4210         }
4211
4212         if (needs_sandboxing) {
4213                 r = apply_protect_hostname(unit, context, exit_status);
4214                 if (r < 0)
4215                         return r;
4216         }
4217
4218         /* Drop groups as early as possible.
4219          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4220          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4221         if (needs_setuid) {
4222                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4223                 int ngids_to_enforce = 0;
4224
4225                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4226                                                    ngids,
4227                                                    gids_after_pam,
4228                                                    ngids_after_pam,
4229                                                    &gids_to_enforce);
4230                 if (ngids_to_enforce < 0) {
4231                         *exit_status = EXIT_MEMORY;
4232                         return log_unit_error_errno(unit,
4233                                                     ngids_to_enforce,
4234                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4235                 }
4236
4237                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4238                 if (r < 0) {
4239                         *exit_status = EXIT_GROUP;
4240                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4241                 }
4242         }
4243
4244         /* If the user namespace was not set up above, try to do it now.
4245          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4246          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4247          * case of mount namespaces being less privileged when the mount point list is copied from a
4248          * different user namespace). */
4249
4250         if (needs_sandboxing && context->private_users && !userns_set_up) {
4251                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4252                 if (r < 0) {
4253                         *exit_status = EXIT_USER;
4254                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4255                 }
4256         }
4257
4258         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4259          * shall execute. */
4260
4261         _cleanup_free_ char *executable = NULL;
4262         _cleanup_close_ int executable_fd = -1;
4263         r = find_executable_full(command->path, false, &executable, &executable_fd);
4264         if (r < 0) {
4265                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4266                         log_struct_errno(LOG_INFO, r,
4267                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4268                                          LOG_UNIT_ID(unit),
4269                                          LOG_UNIT_INVOCATION_ID(unit),
4270                                          LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4271                                                           command->path),
4272                                          "EXECUTABLE=%s", command->path);
4273                         return 0;
4274                 }
4275
4276                 *exit_status = EXIT_EXEC;
4277                 return log_struct_errno(LOG_INFO, r,
4278                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4279                                         LOG_UNIT_ID(unit),
4280                                         LOG_UNIT_INVOCATION_ID(unit),
4281                                         LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4282                                                          command->path),
4283                                         "EXECUTABLE=%s", command->path);
4284         }
4285
4286         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4287         if (r < 0) {
4288                 *exit_status = EXIT_FDS;
4289                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4290         }
4291
4292 #if HAVE_SELINUX
4293         if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4294                 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4295                 if (r < 0) {
4296                         *exit_status = EXIT_SELINUX_CONTEXT;
4297                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4298                 }
4299         }
4300 #endif
4301
4302         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4303          * more aggressive this time since we don't need socket_fd and the netns fds anymore. We do keep the exec_fd
4304          * however if we have it as we want to keep it open until the final execve(). */
4305
4306         r = close_all_fds(keep_fds, n_keep_fds);
4307         if (r >= 0)
4308                 r = shift_fds(fds, n_fds);
4309         if (r >= 0)
4310                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4311         if (r < 0) {
4312                 *exit_status = EXIT_FDS;
4313                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4314         }
4315
4316         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4317          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4318          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4319          * came this far. */
4320
4321         secure_bits = context->secure_bits;
4322
4323         if (needs_sandboxing) {
4324                 uint64_t bset;
4325
4326                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4327                  * requested. (Note this is placed after the general resource limit initialization, see
4328                  * above, in order to take precedence.) */
4329                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4330                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4331                                 *exit_status = EXIT_LIMITS;
4332                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4333                         }
4334                 }
4335
4336 #if ENABLE_SMACK
4337                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4338                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4339                 if (use_smack) {
4340                         r = setup_smack(context, executable_fd);
4341                         if (r < 0) {
4342                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4343                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4344                         }
4345                 }
4346 #endif
4347
4348                 bset = context->capability_bounding_set;
4349                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4350                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4351                  * instead of us doing that */
4352                 if (needs_ambient_hack)
4353                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4354                                 (UINT64_C(1) << CAP_SETUID) |
4355                                 (UINT64_C(1) << CAP_SETGID);
4356
4357                 if (!cap_test_all(bset)) {
4358                         r = capability_bounding_set_drop(bset, false);
4359                         if (r < 0) {
4360                                 *exit_status = EXIT_CAPABILITIES;
4361                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4362                         }
4363                 }
4364
4365                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4366                  * keep-caps set.
4367                  * To be able to raise the ambient capabilities after setresuid() they have to be
4368                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4369                  * After setresuid() the ambient capabilities can be raised as they are present in
4370                  * the permitted and inheritable set. However it is possible that someone wants to
4371                  * set ambient capabilities without changing the user, so we also set the ambient
4372                  * capabilities here.
4373                  * The requested ambient capabilities are raised in the inheritable set if the
4374                  * second argument is true. */
4375                 if (!needs_ambient_hack) {
4376                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4377                         if (r < 0) {
4378                                 *exit_status = EXIT_CAPABILITIES;
4379                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4380                         }
4381                 }
4382         }
4383
4384         /* chroot to root directory first, before we lose the ability to chroot */
4385         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4386         if (r < 0)
4387                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4388
4389         if (needs_setuid) {
4390                 if (uid_is_valid(uid)) {
4391                         r = enforce_user(context, uid);
4392                         if (r < 0) {
4393                                 *exit_status = EXIT_USER;
4394                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4395                         }
4396
4397                         if (!needs_ambient_hack &&
4398                             context->capability_ambient_set != 0) {
4399
4400                                 /* Raise the ambient capabilities after user change. */
4401                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4402                                 if (r < 0) {
4403                                         *exit_status = EXIT_CAPABILITIES;
4404                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4405                                 }
4406                         }
4407                 }
4408         }
4409
4410         /* Apply working directory here, because the working directory might be on NFS and only the user running
4411          * this service might have the correct privilege to change to the working directory */
4412         r = apply_working_directory(context, params, home, exit_status);
4413         if (r < 0)
4414                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4415
4416         if (needs_sandboxing) {
4417                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4418                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4419                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4420                  * are restricted. */
4421
4422 #if HAVE_SELINUX
4423                 if (use_selinux) {
4424                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4425
4426                         if (exec_context) {
4427                                 r = setexeccon(exec_context);
4428                                 if (r < 0) {
4429                                         *exit_status = EXIT_SELINUX_CONTEXT;
4430                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4431                                 }
4432                         }
4433                 }
4434 #endif
4435
4436 #if HAVE_APPARMOR
4437                 if (use_apparmor && context->apparmor_profile) {
4438                         r = aa_change_onexec(context->apparmor_profile);
4439                         if (r < 0 && !context->apparmor_profile_ignore) {
4440                                 *exit_status = EXIT_APPARMOR_PROFILE;
4441                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4442                         }
4443                 }
4444 #endif
4445
4446                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4447                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4448                  * CAP_SETPCAP. */
4449                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4450                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4451                          * effective set here.
4452                          * The effective set is overwritten during execve with the following values:
4453                          * - ambient set (for non-root processes)
4454                          * - (inheritable | bounding) set (for root processes)
4455                          *
4456                          * Hence there is no security impact to raise it in the effective set before execve
4457                          */
4458                         r = capability_gain_cap_setpcap(NULL);
4459                         if (r < 0) {
4460                                 *exit_status = EXIT_CAPABILITIES;
4461                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4462                         }
4463                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4464                                 *exit_status = EXIT_SECUREBITS;
4465                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4466                         }
4467                 }
4468
4469                 if (context_has_no_new_privileges(context))
4470                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4471                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4472                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4473                         }
4474
4475 #if HAVE_SECCOMP
4476                 r = apply_address_families(unit, context);
4477                 if (r < 0) {
4478                         *exit_status = EXIT_ADDRESS_FAMILIES;
4479                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4480                 }
4481
4482                 r = apply_memory_deny_write_execute(unit, context);
4483                 if (r < 0) {
4484                         *exit_status = EXIT_SECCOMP;
4485                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4486                 }
4487
4488                 r = apply_restrict_realtime(unit, context);
4489                 if (r < 0) {
4490                         *exit_status = EXIT_SECCOMP;
4491                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4492                 }
4493
4494                 r = apply_restrict_suid_sgid(unit, context);
4495                 if (r < 0) {
4496                         *exit_status = EXIT_SECCOMP;
4497                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4498                 }
4499
4500                 r = apply_restrict_namespaces(unit, context);
4501                 if (r < 0) {
4502                         *exit_status = EXIT_SECCOMP;
4503                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4504                 }
4505
4506                 r = apply_protect_sysctl(unit, context);
4507                 if (r < 0) {
4508                         *exit_status = EXIT_SECCOMP;
4509                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4510                 }
4511
4512                 r = apply_protect_kernel_modules(unit, context);
4513                 if (r < 0) {
4514                         *exit_status = EXIT_SECCOMP;
4515                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4516                 }
4517
4518                 r = apply_protect_kernel_logs(unit, context);
4519                 if (r < 0) {
4520                         *exit_status = EXIT_SECCOMP;
4521                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4522                 }
4523
4524                 r = apply_protect_clock(unit, context);
4525                 if (r < 0) {
4526                         *exit_status = EXIT_SECCOMP;
4527                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4528                 }
4529
4530                 r = apply_private_devices(unit, context);
4531                 if (r < 0) {
4532                         *exit_status = EXIT_SECCOMP;
4533                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4534                 }
4535
4536                 r = apply_syscall_archs(unit, context);
4537                 if (r < 0) {
4538                         *exit_status = EXIT_SECCOMP;
4539                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4540                 }
4541
4542                 r = apply_lock_personality(unit, context);
4543                 if (r < 0) {
4544                         *exit_status = EXIT_SECCOMP;
4545                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4546                 }
4547
4548                 r = apply_syscall_log(unit, context);
4549                 if (r < 0) {
4550                         *exit_status = EXIT_SECCOMP;
4551                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4552                 }
4553
4554                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4555                  * by the filter as little as possible. */
4556                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4557                 if (r < 0) {
4558                         *exit_status = EXIT_SECCOMP;
4559                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4560                 }
4561 #endif
4562         }
4563
4564         if (!strv_isempty(context->unset_environment)) {
4565                 char **ee = NULL;
4566
4567                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4568                 if (!ee) {
4569                         *exit_status = EXIT_MEMORY;
4570                         return log_oom();
4571                 }
4572
4573                 strv_free_and_replace(accum_env, ee);
4574         }
4575
4576         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4577                 replaced_argv = replace_env_argv(command->argv, accum_env);
4578                 if (!replaced_argv) {
4579                         *exit_status = EXIT_MEMORY;
4580                         return log_oom();
4581                 }
4582                 final_argv = replaced_argv;
4583         } else
4584                 final_argv = command->argv;
4585
4586         if (DEBUG_LOGGING) {
4587                 _cleanup_free_ char *line;
4588
4589                 line = exec_command_line(final_argv);
4590                 if (line)
4591                         log_struct(LOG_DEBUG,
4592                                    "EXECUTABLE=%s", executable,
4593                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4594                                    LOG_UNIT_ID(unit),
4595                                    LOG_UNIT_INVOCATION_ID(unit));
4596         }
4597
4598         if (exec_fd >= 0) {
4599                 uint8_t hot = 1;
4600
4601                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4602                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4603
4604                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4605                         *exit_status = EXIT_EXEC;
4606                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4607                 }
4608         }
4609
4610         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4611
4612         if (exec_fd >= 0) {
4613                 uint8_t hot = 0;
4614
4615                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4616                  * that POLLHUP on it no longer means execve() succeeded. */
4617
4618                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4619                         *exit_status = EXIT_EXEC;
4620                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4621                 }
4622         }
4623
4624         *exit_status = EXIT_EXEC;
4625         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4626 }
4627
4628 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4629 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4630
/* Forks off a child process for 'command' of 'unit'. The child calls exec_child() and then _exit()s with
 * the exit status that call filled in. The parent attaches the new PID to the sub-cgroup (if one is in
 * use), records the start in command->exec_status and hands the PID back through *ret.
 *
 * Returns 0 in the parent on success. On failure the value of the respective logging helper is returned
 * (presumably a negative errno-style value — confirm against log_unit_error_errno()/log_oom()); *ret is
 * left untouched in that case. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
        _cleanup_free_ char *subcgroup_path = NULL;
        _cleanup_strv_free_ char **files_env = NULL;
        size_t n_storage_fds = 0, n_socket_fds = 0;
        _cleanup_free_ char *line = NULL;
        pid_t pid;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        /* An fd array must be supplied whenever any socket or storage fds are announced. */
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        /* If any of the standard streams is connected to a socket we need exactly one socket fd. It is
         * passed on as 'socket_fd', and in that case no fd array is handed to the child ('fds' stays NULL
         * and both counts stay 0). Otherwise the complete fd array is passed through. */
        if (context->std_input == EXEC_INPUT_SOCKET ||
            context->std_output == EXEC_OUTPUT_SOCKET ||
            context->std_error == EXEC_OUTPUT_SOCKET) {

                if (params->n_socket_fds > 1)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");

                if (params->n_socket_fds == 0)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");

                socket_fd = params->fds[0];
        } else {
                socket_fd = -1;
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        }

        /* Resolve the fds that are referenced by name for stdin/stdout/stderr. */
        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        /* Read the configured environment files in the parent, before forking. */
        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        /* Command line string, only used for the debug message below. */
        line = exec_command_line(command->argv);
        if (!line)
                return log_oom();

        /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
           and, until the next SELinux policy changes, we save further reloads in future children. */
        mac_selinux_maybe_reload();

        log_struct(LOG_DEBUG,
                   LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
                   "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
                                                      the mount namespace in the child, but we want to log
                                                      from the parent, so we need to use the (possibly
                                                      inaccurate) path here. */
                   LOG_UNIT_ID(unit),
                   LOG_UNIT_INVOCATION_ID(unit));

        /* Create the sub-cgroup before forking, so that it exists by the time the parent attaches the
         * child to it further down. */
        if (params->cgroup_path) {
                r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                if (r > 0) { /* We are using a child cgroup */
                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                        if (r < 0)
                                return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
                }
        }

        pid = fork();
        if (pid < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (pid == 0) {
                /* Child: this branch never returns to the caller, it always ends in _exit(). */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        /* Translate the exit status into a step name for the log message. */
                        const char *status =
                                exit_status_to_string(exit_status,
                                                      EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);

                        log_struct_errno(LOG_ERR, r,
                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                         LOG_UNIT_ID(unit),
                                         LOG_UNIT_INVOCATION_ID(unit),
                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                          status, command->path),
                                         "EXECUTABLE=%s", command->path);
                }

                _exit(exit_status);
        }

        /* Parent from here on. */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);

        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
         * process will be killed too). */
        if (subcgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); /* result deliberately ignored */

        exec_status_start(&command->exec_status, pid);

        *ret = pid;
        return 0;
}
4759
4760 void exec_context_init(ExecContext *c) {
4761         assert(c);
4762
4763         c->umask = 0022;
4764         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4765         c->cpu_sched_policy = SCHED_OTHER;
4766         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4767         c->syslog_level_prefix = true;
4768         c->ignore_sigpipe = true;
4769         c->timer_slack_nsec = NSEC_INFINITY;
4770         c->personality = PERSONALITY_INVALID;
4771         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4772                 c->directories[t].mode = 0755;
4773         c->timeout_clean_usec = USEC_INFINITY;
4774         c->capability_bounding_set = CAP_ALL;
4775         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4776         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4777         c->log_level_max = -1;
4778 #if HAVE_SECCOMP
4779         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4780 #endif
4781         numa_policy_reset(&c->numa_policy);
4782 }
4783
4784 void exec_context_done(ExecContext *c) {
4785         assert(c);
4786
4787         c->environment = strv_free(c->environment);
4788         c->environment_files = strv_free(c->environment_files);
4789         c->pass_environment = strv_free(c->pass_environment);
4790         c->unset_environment = strv_free(c->unset_environment);
4791
4792         rlimit_free_all(c->rlimit);
4793
4794         for (size_t l = 0; l < 3; l++) {
4795                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4796                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4797         }
4798
4799         c->working_directory = mfree(c->working_directory);
4800         c->root_directory = mfree(c->root_directory);
4801         c->root_image = mfree(c->root_image);
4802         c->root_image_options = mount_options_free_all(c->root_image_options);
4803         c->root_hash = mfree(c->root_hash);
4804         c->root_hash_size = 0;
4805         c->root_hash_path = mfree(c->root_hash_path);
4806         c->root_hash_sig = mfree(c->root_hash_sig);
4807         c->root_hash_sig_size = 0;
4808         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4809         c->root_verity = mfree(c->root_verity);
4810         c->tty_path = mfree(c->tty_path);
4811         c->syslog_identifier = mfree(c->syslog_identifier);
4812         c->user = mfree(c->user);
4813         c->group = mfree(c->group);
4814
4815         c->supplementary_groups = strv_free(c->supplementary_groups);
4816
4817         c->pam_name = mfree(c->pam_name);
4818
4819         c->read_only_paths = strv_free(c->read_only_paths);
4820         c->read_write_paths = strv_free(c->read_write_paths);
4821         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4822         c->exec_paths = strv_free(c->exec_paths);
4823         c->no_exec_paths = strv_free(c->no_exec_paths);
4824
4825         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4826         c->bind_mounts = NULL;
4827         c->n_bind_mounts = 0;
4828         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4829         c->temporary_filesystems = NULL;
4830         c->n_temporary_filesystems = 0;
4831         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
4832
4833         cpu_set_reset(&c->cpu_set);
4834         numa_policy_reset(&c->numa_policy);
4835
4836         c->utmp_id = mfree(c->utmp_id);
4837         c->selinux_context = mfree(c->selinux_context);
4838         c->apparmor_profile = mfree(c->apparmor_profile);
4839         c->smack_process_label = mfree(c->smack_process_label);
4840
4841         c->syscall_filter = hashmap_free(c->syscall_filter);
4842         c->syscall_archs = set_free(c->syscall_archs);
4843         c->address_families = set_free(c->address_families);
4844
4845         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4846                 c->directories[t].paths = strv_free(c->directories[t].paths);
4847
4848         c->log_level_max = -1;
4849
4850         exec_context_free_log_extra_fields(c);
4851
4852         c->log_ratelimit_interval_usec = 0;
4853         c->log_ratelimit_burst = 0;
4854
4855         c->stdin_data = mfree(c->stdin_data);
4856         c->stdin_data_size = 0;
4857
4858         c->network_namespace_path = mfree(c->network_namespace_path);
4859
4860         c->log_namespace = mfree(c->log_namespace);
4861
4862         c->load_credentials = strv_free(c->load_credentials);
4863         c->set_credentials = hashmap_free(c->set_credentials);
4864 }
4865
4866 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4867         char **i;
4868
4869         assert(c);
4870
4871         if (!runtime_prefix)
4872                 return 0;
4873
4874         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4875                 _cleanup_free_ char *p;
4876
4877                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4878                         p = path_join(runtime_prefix, "private", *i);
4879                 else
4880                         p = path_join(runtime_prefix, *i);
4881                 if (!p)
4882                         return -ENOMEM;
4883
4884                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4885                  * service next. */
4886                 (void) rm_rf(p, REMOVE_ROOT);
4887         }
4888
4889         return 0;
4890 }
4891
4892 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4893         _cleanup_free_ char *p = NULL;
4894
4895         assert(c);
4896
4897         if (!runtime_prefix || !unit)
4898                 return 0;
4899
4900         p = path_join(runtime_prefix, "credentials", unit);
4901         if (!p)
4902                 return -ENOMEM;
4903
4904         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4905          * unmount it, and afterwards remove the mount point */
4906         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4907         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4908
4909         return 0;
4910 }
4911
4912 static void exec_command_done(ExecCommand *c) {
4913         assert(c);
4914
4915         c->path = mfree(c->path);
4916         c->argv = strv_free(c->argv);
4917 }
4918
4919 void exec_command_done_array(ExecCommand *c, size_t n) {
4920         for (size_t i = 0; i < n; i++)
4921                 exec_command_done(c+i);
4922 }
4923
4924 ExecCommand* exec_command_free_list(ExecCommand *c) {
4925         ExecCommand *i;
4926
4927         while ((i = c)) {
4928                 LIST_REMOVE(command, c, i);
4929                 exec_command_done(i);
4930                 free(i);
4931         }
4932
4933         return NULL;
4934 }
4935
4936 void exec_command_free_array(ExecCommand **c, size_t n) {
4937         for (size_t i = 0; i < n; i++)
4938                 c[i] = exec_command_free_list(c[i]);
4939 }
4940
4941 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4942         for (size_t i = 0; i < n; i++)
4943                 exec_status_reset(&c[i].exec_status);
4944 }
4945
4946 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4947         for (size_t i = 0; i < n; i++) {
4948                 ExecCommand *z;
4949
4950                 LIST_FOREACH(command, z, c[i])
4951                         exec_status_reset(&z->exec_status);
4952         }
4953 }
4954
/* Userdata handed to invalid_env(): identifies the unit to log for and the environment file in which
 * the rejected assignment was found. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file being processed */
} InvalidEnvInfo;
4959
4960 static void invalid_env(const char *p, void *userdata) {
4961         InvalidEnvInfo *info = userdata;
4962
4963         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4964 }
4965
4966 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4967         assert(c);
4968
4969         switch (fd_index) {
4970
4971         case STDIN_FILENO:
4972                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4973                         return NULL;
4974
4975                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4976
4977         case STDOUT_FILENO:
4978                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4979                         return NULL;
4980
4981                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4982
4983         case STDERR_FILENO:
4984                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4985                         return NULL;
4986
4987                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4988
4989         default:
4990                 return NULL;
4991         }
4992 }
4993
4994 static int exec_context_named_iofds(
4995                 const ExecContext *c,
4996                 const ExecParameters *p,
4997                 int named_iofds[static 3]) {
4998
4999         size_t targets;
5000         const char* stdio_fdname[3];
5001         size_t n_fds;
5002
5003         assert(c);
5004         assert(p);
5005         assert(named_iofds);
5006
5007         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5008                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5009                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5010
5011         for (size_t i = 0; i < 3; i++)
5012                 stdio_fdname[i] = exec_context_fdname(c, i);
5013
5014         n_fds = p->n_storage_fds + p->n_socket_fds;
5015
5016         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5017                 if (named_iofds[STDIN_FILENO] < 0 &&
5018                     c->std_input == EXEC_INPUT_NAMED_FD &&
5019                     stdio_fdname[STDIN_FILENO] &&
5020                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5021
5022                         named_iofds[STDIN_FILENO] = p->fds[i];
5023                         targets--;
5024
5025                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5026                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5027                            stdio_fdname[STDOUT_FILENO] &&
5028                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5029
5030                         named_iofds[STDOUT_FILENO] = p->fds[i];
5031                         targets--;
5032
5033                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5034                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5035                            stdio_fdname[STDERR_FILENO] &&
5036                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5037
5038                         named_iofds[STDERR_FILENO] = p->fds[i];
5039                         targets--;
5040                 }
5041
5042         return targets == 0 ? 0 : -ENOENT;
5043 }
5044
5045 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5046         char **i, **r = NULL;
5047
5048         assert(c);
5049         assert(l);
5050
5051         STRV_FOREACH(i, c->environment_files) {
5052                 char *fn;
5053                 int k;
5054                 bool ignore = false;
5055                 char **p;
5056                 _cleanup_globfree_ glob_t pglob = {};
5057
5058                 fn = *i;
5059
5060                 if (fn[0] == '-') {
5061                         ignore = true;
5062                         fn++;
5063                 }
5064
5065                 if (!path_is_absolute(fn)) {
5066                         if (ignore)
5067                                 continue;
5068
5069                         strv_free(r);
5070                         return -EINVAL;
5071                 }
5072
5073                 /* Filename supports globbing, take all matching files */
5074                 k = safe_glob(fn, 0, &pglob);
5075                 if (k < 0) {
5076                         if (ignore)
5077                                 continue;
5078
5079                         strv_free(r);
5080                         return k;
5081                 }
5082
5083                 /* When we don't match anything, -ENOENT should be returned */
5084                 assert(pglob.gl_pathc > 0);
5085
5086                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5087                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5088                         if (k < 0) {
5089                                 if (ignore)
5090                                         continue;
5091
5092                                 strv_free(r);
5093                                 return k;
5094                         }
5095                         /* Log invalid environment variables with filename */
5096                         if (p) {
5097                                 InvalidEnvInfo info = {
5098                                         .unit = unit,
5099                                         .path = pglob.gl_pathv[n]
5100                                 };
5101
5102                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5103                         }
5104
5105                         if (!r)
5106                                 r = p;
5107                         else {
5108                                 char **m;
5109
5110                                 m = strv_env_merge(2, r, p);
5111                                 strv_free(r);
5112                                 strv_free(p);
5113                                 if (!m)
5114                                         return -ENOMEM;
5115
5116                                 r = m;
5117                         }
5118                 }
5119         }
5120
5121         *l = r;
5122
5123         return 0;
5124 }
5125
5126 static bool tty_may_match_dev_console(const char *tty) {
5127         _cleanup_free_ char *resolved = NULL;
5128
5129         if (!tty)
5130                 return true;
5131
5132         tty = skip_dev_prefix(tty);
5133
5134         /* trivial identity? */
5135         if (streq(tty, "console"))
5136                 return true;
5137
5138         if (resolve_dev_console(&resolved) < 0)
5139                 return true; /* if we could not resolve, assume it may */
5140
5141         /* "tty0" means the active VC, so it may be the same sometimes */
5142         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5143 }
5144
5145 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5146         assert(ec);
5147
5148         return ec->tty_reset ||
5149                 ec->tty_vhangup ||
5150                 ec->tty_vt_disallocate ||
5151                 is_terminal_input(ec->std_input) ||
5152                 is_terminal_output(ec->std_output) ||
5153                 is_terminal_output(ec->std_error);
5154 }
5155
5156 bool exec_context_may_touch_console(const ExecContext *ec) {
5157
5158         return exec_context_may_touch_tty(ec) &&
5159                tty_may_match_dev_console(exec_context_tty_path(ec));
5160 }
5161
5162 static void strv_fprintf(FILE *f, char **l) {
5163         char **g;
5164
5165         assert(f);
5166
5167         STRV_FOREACH(g, l)
5168                 fprintf(f, " %s", *g);
5169 }
5170
/* Writes one line of the form "<prefix><name>: <s1> <s2> ...\n" to 'f'. Nothing is written if the
 * string vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        /* The prefix goes first, followed by the field name, the same way as in the "%sUMask: ..." lines
         * written by exec_context_dump(). */
        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5182
5183 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5184         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
5185         int r;
5186
5187         assert(c);
5188         assert(f);
5189
5190         prefix = strempty(prefix);
5191
5192         fprintf(f,
5193                 "%sUMask: %04o\n"
5194                 "%sWorkingDirectory: %s\n"
5195                 "%sRootDirectory: %s\n"
5196                 "%sNonBlocking: %s\n"
5197                 "%sPrivateTmp: %s\n"
5198                 "%sPrivateDevices: %s\n"
5199                 "%sProtectKernelTunables: %s\n"
5200                 "%sProtectKernelModules: %s\n"
5201                 "%sProtectKernelLogs: %s\n"
5202                 "%sProtectClock: %s\n"
5203                 "%sProtectControlGroups: %s\n"
5204                 "%sPrivateNetwork: %s\n"
5205                 "%sPrivateUsers: %s\n"
5206                 "%sProtectHome: %s\n"
5207                 "%sProtectSystem: %s\n"
5208                 "%sMountAPIVFS: %s\n"
5209                 "%sIgnoreSIGPIPE: %s\n"
5210                 "%sMemoryDenyWriteExecute: %s\n"
5211                 "%sRestrictRealtime: %s\n"
5212                 "%sRestrictSUIDSGID: %s\n"
5213                 "%sKeyringMode: %s\n"
5214                 "%sProtectHostname: %s\n"
5215                 "%sProtectProc: %s\n"
5216                 "%sProcSubset: %s\n",
5217                 prefix, c->umask,
5218                 prefix, empty_to_root(c->working_directory),
5219                 prefix, empty_to_root(c->root_directory),
5220                 prefix, yes_no(c->non_blocking),
5221                 prefix, yes_no(c->private_tmp),
5222                 prefix, yes_no(c->private_devices),
5223                 prefix, yes_no(c->protect_kernel_tunables),
5224                 prefix, yes_no(c->protect_kernel_modules),
5225                 prefix, yes_no(c->protect_kernel_logs),
5226                 prefix, yes_no(c->protect_clock),
5227                 prefix, yes_no(c->protect_control_groups),
5228                 prefix, yes_no(c->private_network),
5229                 prefix, yes_no(c->private_users),
5230                 prefix, protect_home_to_string(c->protect_home),
5231                 prefix, protect_system_to_string(c->protect_system),
5232                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5233                 prefix, yes_no(c->ignore_sigpipe),
5234                 prefix, yes_no(c->memory_deny_write_execute),
5235                 prefix, yes_no(c->restrict_realtime),
5236                 prefix, yes_no(c->restrict_suid_sgid),
5237                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5238                 prefix, yes_no(c->protect_hostname),
5239                 prefix, protect_proc_to_string(c->protect_proc),
5240                 prefix, proc_subset_to_string(c->proc_subset));
5241
5242         if (c->root_image)
5243                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5244
5245         if (c->root_image_options) {
5246                 MountOptions *o;
5247
5248                 fprintf(f, "%sRootImageOptions:", prefix);
5249                 LIST_FOREACH(mount_options, o, c->root_image_options)
5250                         if (!isempty(o->options))
5251                                 fprintf(f, " %s:%s",
5252                                         partition_designator_to_string(o->partition_designator),
5253                                         o->options);
5254                 fprintf(f, "\n");
5255         }
5256
5257         if (c->root_hash) {
5258                 _cleanup_free_ char *encoded = NULL;
5259                 encoded = hexmem(c->root_hash, c->root_hash_size);
5260                 if (encoded)
5261                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5262         }
5263
5264         if (c->root_hash_path)
5265                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5266
5267         if (c->root_hash_sig) {
5268                 _cleanup_free_ char *encoded = NULL;
5269                 ssize_t len;
5270                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5271                 if (len)
5272                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5273         }
5274
5275         if (c->root_hash_sig_path)
5276                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5277
5278         if (c->root_verity)
5279                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5280
5281         STRV_FOREACH(e, c->environment)
5282                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5283
5284         STRV_FOREACH(e, c->environment_files)
5285                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5286
5287         STRV_FOREACH(e, c->pass_environment)
5288                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5289
5290         STRV_FOREACH(e, c->unset_environment)
5291                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5292
5293         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5294
5295         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5296                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5297
5298                 STRV_FOREACH(d, c->directories[dt].paths)
5299                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5300         }
5301
5302         fprintf(f,
5303                 "%sTimeoutCleanSec: %s\n",
5304                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5305
5306         if (c->nice_set)
5307                 fprintf(f,
5308                         "%sNice: %i\n",
5309                         prefix, c->nice);
5310
5311         if (c->oom_score_adjust_set)
5312                 fprintf(f,
5313                         "%sOOMScoreAdjust: %i\n",
5314                         prefix, c->oom_score_adjust);
5315
5316         if (c->coredump_filter_set)
5317                 fprintf(f,
5318                         "%sCoredumpFilter: 0x%"PRIx64"\n",
5319                         prefix, c->coredump_filter);
5320
5321         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5322                 if (c->rlimit[i]) {
5323                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5324                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5325                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5326                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5327                 }
5328
5329         if (c->ioprio_set) {
5330                 _cleanup_free_ char *class_str = NULL;
5331
5332                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5333                 if (r >= 0)
5334                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5335
5336                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
5337         }
5338
5339         if (c->cpu_sched_set) {
5340                 _cleanup_free_ char *policy_str = NULL;
5341
5342                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5343                 if (r >= 0)
5344                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5345
5346                 fprintf(f,
5347                         "%sCPUSchedulingPriority: %i\n"
5348                         "%sCPUSchedulingResetOnFork: %s\n",
5349                         prefix, c->cpu_sched_priority,
5350                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5351         }
5352
5353         if (c->cpu_set.set) {
5354                 _cleanup_free_ char *affinity = NULL;
5355
5356                 affinity = cpu_set_to_range_string(&c->cpu_set);
5357                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5358         }
5359
5360         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5361                 _cleanup_free_ char *nodes = NULL;
5362
5363                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5364                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5365                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5366         }
5367
5368         if (c->timer_slack_nsec != NSEC_INFINITY)
5369                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5370
5371         fprintf(f,
5372                 "%sStandardInput: %s\n"
5373                 "%sStandardOutput: %s\n"
5374                 "%sStandardError: %s\n",
5375                 prefix, exec_input_to_string(c->std_input),
5376                 prefix, exec_output_to_string(c->std_output),
5377                 prefix, exec_output_to_string(c->std_error));
5378
5379         if (c->std_input == EXEC_INPUT_NAMED_FD)
5380                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5381         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5382                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5383         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5384                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5385
5386         if (c->std_input == EXEC_INPUT_FILE)
5387                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5388         if (c->std_output == EXEC_OUTPUT_FILE)
5389                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5390         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5391                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5392         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5393                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5394         if (c->std_error == EXEC_OUTPUT_FILE)
5395                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5396         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5397                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5398         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5399                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5400
5401         if (c->tty_path)
5402                 fprintf(f,
5403                         "%sTTYPath: %s\n"
5404                         "%sTTYReset: %s\n"
5405                         "%sTTYVHangup: %s\n"
5406                         "%sTTYVTDisallocate: %s\n",
5407                         prefix, c->tty_path,
5408                         prefix, yes_no(c->tty_reset),
5409                         prefix, yes_no(c->tty_vhangup),
5410                         prefix, yes_no(c->tty_vt_disallocate));
5411
5412         if (IN_SET(c->std_output,
5413                    EXEC_OUTPUT_KMSG,
5414                    EXEC_OUTPUT_JOURNAL,
5415                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5416                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5417             IN_SET(c->std_error,
5418                    EXEC_OUTPUT_KMSG,
5419                    EXEC_OUTPUT_JOURNAL,
5420                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5421                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5422
5423                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5424
5425                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5426                 if (r >= 0)
5427                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5428
5429                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5430                 if (r >= 0)
5431                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5432         }
5433
5434         if (c->log_level_max >= 0) {
5435                 _cleanup_free_ char *t = NULL;
5436
5437                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5438
5439                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5440         }
5441
5442         if (c->log_ratelimit_interval_usec > 0) {
5443                 char buf_timespan[FORMAT_TIMESPAN_MAX];
5444
5445                 fprintf(f,
5446                         "%sLogRateLimitIntervalSec: %s\n",
5447                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
5448         }
5449
5450         if (c->log_ratelimit_burst > 0)
5451                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5452
5453         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5454                 fprintf(f, "%sLogExtraFields: ", prefix);
5455                 fwrite(c->log_extra_fields[j].iov_base,
5456                        1, c->log_extra_fields[j].iov_len,
5457                        f);
5458                 fputc('\n', f);
5459         }
5460
5461         if (c->log_namespace)
5462                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5463
5464         if (c->secure_bits) {
5465                 _cleanup_free_ char *str = NULL;
5466
5467                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5468                 if (r >= 0)
5469                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5470         }
5471
5472         if (c->capability_bounding_set != CAP_ALL) {
5473                 _cleanup_free_ char *str = NULL;
5474
5475                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5476                 if (r >= 0)
5477                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5478         }
5479
5480         if (c->capability_ambient_set != 0) {
5481                 _cleanup_free_ char *str = NULL;
5482
5483                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5484                 if (r >= 0)
5485                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5486         }
5487
5488         if (c->user)
5489                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5490         if (c->group)
5491                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5492
5493         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5494
5495         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5496
5497         if (c->pam_name)
5498                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5499
5500         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5501         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5502         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5503         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5504         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5505
5506         for (size_t i = 0; i < c->n_bind_mounts; i++)
5507                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5508                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5509                         c->bind_mounts[i].ignore_enoent ? "-": "",
5510                         c->bind_mounts[i].source,
5511                         c->bind_mounts[i].destination,
5512                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5513
5514         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5515                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5516
5517                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5518                         t->path,
5519                         isempty(t->options) ? "" : ":",
5520                         strempty(t->options));
5521         }
5522
5523         if (c->utmp_id)
5524                 fprintf(f,
5525                         "%sUtmpIdentifier: %s\n",
5526                         prefix, c->utmp_id);
5527
5528         if (c->selinux_context)
5529                 fprintf(f,
5530                         "%sSELinuxContext: %s%s\n",
5531                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5532
5533         if (c->apparmor_profile)
5534                 fprintf(f,
5535                         "%sAppArmorProfile: %s%s\n",
5536                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5537
5538         if (c->smack_process_label)
5539                 fprintf(f,
5540                         "%sSmackProcessLabel: %s%s\n",
5541                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5542
5543         if (c->personality != PERSONALITY_INVALID)
5544                 fprintf(f,
5545                         "%sPersonality: %s\n",
5546                         prefix, strna(personality_to_string(c->personality)));
5547
5548         fprintf(f,
5549                 "%sLockPersonality: %s\n",
5550                 prefix, yes_no(c->lock_personality));
5551
5552         if (c->syscall_filter) {
5553 #if HAVE_SECCOMP
5554                 void *id, *val;
5555                 bool first = true;
5556 #endif
5557
5558                 fprintf(f,
5559                         "%sSystemCallFilter: ",
5560                         prefix);
5561
5562                 if (!c->syscall_allow_list)
5563                         fputc('~', f);
5564
5565 #if HAVE_SECCOMP
5566                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5567                         _cleanup_free_ char *name = NULL;
5568                         const char *errno_name = NULL;
5569                         int num = PTR_TO_INT(val);
5570
5571                         if (first)
5572                                 first = false;
5573                         else
5574                                 fputc(' ', f);
5575
5576                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5577                         fputs(strna(name), f);
5578
5579                         if (num >= 0) {
5580                                 errno_name = seccomp_errno_or_action_to_string(num);
5581                                 if (errno_name)
5582                                         fprintf(f, ":%s", errno_name);
5583                                 else
5584                                         fprintf(f, ":%d", num);
5585                         }
5586                 }
5587 #endif
5588
5589                 fputc('\n', f);
5590         }
5591
5592         if (c->syscall_archs) {
5593 #if HAVE_SECCOMP
5594                 void *id;
5595 #endif
5596
5597                 fprintf(f,
5598                         "%sSystemCallArchitectures:",
5599                         prefix);
5600
5601 #if HAVE_SECCOMP
5602                 SET_FOREACH(id, c->syscall_archs)
5603                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5604 #endif
5605                 fputc('\n', f);
5606         }
5607
5608         if (exec_context_restrict_namespaces_set(c)) {
5609                 _cleanup_free_ char *s = NULL;
5610
5611                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5612                 if (r >= 0)
5613                         fprintf(f, "%sRestrictNamespaces: %s\n",
5614                                 prefix, strna(s));
5615         }
5616
5617         if (c->network_namespace_path)
5618                 fprintf(f,
5619                         "%sNetworkNamespacePath: %s\n",
5620                         prefix, c->network_namespace_path);
5621
5622         if (c->syscall_errno > 0) {
5623 #if HAVE_SECCOMP
5624                 const char *errno_name;
5625 #endif
5626
5627                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5628
5629 #if HAVE_SECCOMP
5630                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5631                 if (errno_name)
5632                         fputs(errno_name, f);
5633                 else
5634                         fprintf(f, "%d", c->syscall_errno);
5635 #endif
5636                 fputc('\n', f);
5637         }
5638
5639         for (size_t i = 0; i < c->n_mount_images; i++) {
5640                 MountOptions *o;
5641
5642                 fprintf(f, "%sMountImages: %s%s:%s%s", prefix,
5643                         c->mount_images[i].ignore_enoent ? "-": "",
5644                         c->mount_images[i].source,
5645                         c->mount_images[i].destination,
5646                         LIST_IS_EMPTY(c->mount_images[i].mount_options) ? "": ":");
5647                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5648                         fprintf(f, "%s:%s",
5649                                 partition_designator_to_string(o->partition_designator),
5650                                 o->options);
5651                 fprintf(f, "\n");
5652         }
5653 }
5654
5655 bool exec_context_maintains_privileges(const ExecContext *c) {
5656         assert(c);
5657
5658         /* Returns true if the process forked off would run under
5659          * an unchanged UID or as root. */
5660
5661         if (!c->user)
5662                 return true;
5663
5664         if (streq(c->user, "root") || streq(c->user, "0"))
5665                 return true;
5666
5667         return false;
5668 }
5669
5670 int exec_context_get_effective_ioprio(const ExecContext *c) {
5671         int p;
5672
5673         assert(c);
5674
5675         if (c->ioprio_set)
5676                 return c->ioprio;
5677
5678         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5679         if (p < 0)
5680                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5681
5682         return p;
5683 }
5684
5685 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5686         assert(c);
5687
5688         /* Explicit setting wins */
5689         if (c->mount_apivfs_set)
5690                 return c->mount_apivfs;
5691
5692         /* Default to "yes" if root directory or image are specified */
5693         if (exec_context_with_rootfs(c))
5694                 return true;
5695
5696         return false;
5697 }
5698
5699 void exec_context_free_log_extra_fields(ExecContext *c) {
5700         assert(c);
5701
5702         for (size_t l = 0; l < c->n_log_extra_fields; l++)
5703                 free(c->log_extra_fields[l].iov_base);
5704         c->log_extra_fields = mfree(c->log_extra_fields);
5705         c->n_log_extra_fields = 0;
5706 }
5707
5708 void exec_context_revert_tty(ExecContext *c) {
5709         int r;
5710
5711         assert(c);
5712
5713         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5714         exec_context_tty_reset(c, NULL);
5715
5716         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5717          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5718          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5719
5720         if (exec_context_may_touch_tty(c)) {
5721                 const char *path;
5722
5723                 path = exec_context_tty_path(c);
5724                 if (path) {
5725                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5726                         if (r < 0 && r != -ENOENT)
5727                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5728                 }
5729         }
5730 }
5731
5732 int exec_context_get_clean_directories(
5733                 ExecContext *c,
5734                 char **prefix,
5735                 ExecCleanMask mask,
5736                 char ***ret) {
5737
5738         _cleanup_strv_free_ char **l = NULL;
5739         int r;
5740
5741         assert(c);
5742         assert(prefix);
5743         assert(ret);
5744
5745         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5746                 char **i;
5747
5748                 if (!FLAGS_SET(mask, 1U << t))
5749                         continue;
5750
5751                 if (!prefix[t])
5752                         continue;
5753
5754                 STRV_FOREACH(i, c->directories[t].paths) {
5755                         char *j;
5756
5757                         j = path_join(prefix[t], *i);
5758                         if (!j)
5759                                 return -ENOMEM;
5760
5761                         r = strv_consume(&l, j);
5762                         if (r < 0)
5763                                 return r;
5764
5765                         /* Also remove private directories unconditionally. */
5766                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5767                                 j = path_join(prefix[t], "private", *i);
5768                                 if (!j)
5769                                         return -ENOMEM;
5770
5771                                 r = strv_consume(&l, j);
5772                                 if (r < 0)
5773                                         return r;
5774                         }
5775                 }
5776         }
5777
5778         *ret = TAKE_PTR(l);
5779         return 0;
5780 }
5781
5782 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5783         ExecCleanMask mask = 0;
5784
5785         assert(c);
5786         assert(ret);
5787
5788         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5789                 if (!strv_isempty(c->directories[t].paths))
5790                         mask |= 1U << t;
5791
5792         *ret = mask;
5793         return 0;
5794 }
5795
5796 void exec_status_start(ExecStatus *s, pid_t pid) {
5797         assert(s);
5798
5799         *s = (ExecStatus) {
5800                 .pid = pid,
5801         };
5802
5803         dual_timestamp_get(&s->start_timestamp);
5804 }
5805
5806 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5807         assert(s);
5808
5809         if (s->pid != pid)
5810                 *s = (ExecStatus) {
5811                         .pid = pid,
5812                 };
5813
5814         dual_timestamp_get(&s->exit_timestamp);
5815
5816         s->code = code;
5817         s->status = status;
5818
5819         if (context && context->utmp_id)
5820                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5821 }
5822
5823 void exec_status_reset(ExecStatus *s) {
5824         assert(s);
5825
5826         *s = (ExecStatus) {};
5827 }
5828
5829 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5830         char buf[FORMAT_TIMESTAMP_MAX];
5831
5832         assert(s);
5833         assert(f);
5834
5835         if (s->pid <= 0)
5836                 return;
5837
5838         prefix = strempty(prefix);
5839
5840         fprintf(f,
5841                 "%sPID: "PID_FMT"\n",
5842                 prefix, s->pid);
5843
5844         if (dual_timestamp_is_set(&s->start_timestamp))
5845                 fprintf(f,
5846                         "%sStart Timestamp: %s\n",
5847                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5848
5849         if (dual_timestamp_is_set(&s->exit_timestamp))
5850                 fprintf(f,
5851                         "%sExit Timestamp: %s\n"
5852                         "%sExit Code: %s\n"
5853                         "%sExit Status: %i\n",
5854                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5855                         prefix, sigchld_code_to_string(s->code),
5856                         prefix, s->status);
5857 }
5858
5859 static char *exec_command_line(char **argv) {
5860         size_t k;
5861         char *n, *p, **a;
5862         bool first = true;
5863
5864         assert(argv);
5865
5866         k = 1;
5867         STRV_FOREACH(a, argv)
5868                 k += strlen(*a)+3;
5869
5870         n = new(char, k);
5871         if (!n)
5872                 return NULL;
5873
5874         p = n;
5875         STRV_FOREACH(a, argv) {
5876
5877                 if (!first)
5878                         *(p++) = ' ';
5879                 else
5880                         first = false;
5881
5882                 if (strpbrk(*a, WHITESPACE)) {
5883                         *(p++) = '\'';
5884                         p = stpcpy(p, *a);
5885                         *(p++) = '\'';
5886                 } else
5887                         p = stpcpy(p, *a);
5888
5889         }
5890
5891         *p = 0;
5892
5893         /* FIXME: this doesn't really handle arguments that have
5894          * spaces and ticks in them */
5895
5896         return n;
5897 }
5898
5899 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5900         _cleanup_free_ char *cmd = NULL;
5901         const char *prefix2;
5902
5903         assert(c);
5904         assert(f);
5905
5906         prefix = strempty(prefix);
5907         prefix2 = strjoina(prefix, "\t");
5908
5909         cmd = exec_command_line(c->argv);
5910         fprintf(f,
5911                 "%sCommand Line: %s\n",
5912                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5913
5914         exec_status_dump(&c->exec_status, f, prefix2);
5915 }
5916
5917 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5918         assert(f);
5919
5920         prefix = strempty(prefix);
5921
5922         LIST_FOREACH(command, c, c)
5923                 exec_command_dump(c, f, prefix);
5924 }
5925
5926 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5927         ExecCommand *end;
5928
5929         assert(l);
5930         assert(e);
5931
5932         if (*l) {
5933                 /* It's kind of important, that we keep the order here */
5934                 LIST_FIND_TAIL(command, *l, end);
5935                 LIST_INSERT_AFTER(command, *l, end, e);
5936         } else
5937               *l = e;
5938 }
5939
5940 int exec_command_set(ExecCommand *c, const char *path, ...) {
5941         va_list ap;
5942         char **l, *p;
5943
5944         assert(c);
5945         assert(path);
5946
5947         va_start(ap, path);
5948         l = strv_new_ap(path, ap);
5949         va_end(ap);
5950
5951         if (!l)
5952                 return -ENOMEM;
5953
5954         p = strdup(path);
5955         if (!p) {
5956                 strv_free(l);
5957                 return -ENOMEM;
5958         }
5959
5960         free_and_replace(c->path, p);
5961
5962         return strv_free_and_replace(c->argv, l);
5963 }
5964
5965 int exec_command_append(ExecCommand *c, const char *path, ...) {
5966         _cleanup_strv_free_ char **l = NULL;
5967         va_list ap;
5968         int r;
5969
5970         assert(c);
5971         assert(path);
5972
5973         va_start(ap, path);
5974         l = strv_new_ap(path, ap);
5975         va_end(ap);
5976
5977         if (!l)
5978                 return -ENOMEM;
5979
5980         r = strv_extend_strv(&c->argv, l, false);
5981         if (r < 0)
5982                 return r;
5983
5984         return 0;
5985 }
5986
5987 static void *remove_tmpdir_thread(void *p) {
5988         _cleanup_free_ char *path = p;
5989
5990         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5991         return NULL;
5992 }
5993
5994 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5995         int r;
5996
5997         if (!rt)
5998                 return NULL;
5999
6000         if (rt->manager)
6001                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6002
6003         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6004
6005         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6006                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6007
6008                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6009                 if (r < 0)
6010                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6011                 else
6012                         rt->tmp_dir = NULL;
6013         }
6014
6015         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6016                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6017
6018                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6019                 if (r < 0)
6020                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6021                 else
6022                         rt->var_tmp_dir = NULL;
6023         }
6024
6025         rt->id = mfree(rt->id);
6026         rt->tmp_dir = mfree(rt->tmp_dir);
6027         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6028         safe_close_pair(rt->netns_storage_socket);
6029         return mfree(rt);
6030 }
6031
6032 static void exec_runtime_freep(ExecRuntime **rt) {
6033         (void) exec_runtime_free(*rt, false);
6034 }
6035
6036 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6037         _cleanup_free_ char *id_copy = NULL;
6038         ExecRuntime *n;
6039
6040         assert(ret);
6041
6042         id_copy = strdup(id);
6043         if (!id_copy)
6044                 return -ENOMEM;
6045
6046         n = new(ExecRuntime, 1);
6047         if (!n)
6048                 return -ENOMEM;
6049
6050         *n = (ExecRuntime) {
6051                 .id = TAKE_PTR(id_copy),
6052                 .netns_storage_socket = { -1, -1 },
6053         };
6054
6055         *ret = n;
6056         return 0;
6057 }
6058
6059 static int exec_runtime_add(
6060                 Manager *m,
6061                 const char *id,
6062                 char **tmp_dir,
6063                 char **var_tmp_dir,
6064                 int netns_storage_socket[2],
6065                 ExecRuntime **ret) {
6066
6067         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6068         int r;
6069
6070         assert(m);
6071         assert(id);
6072
6073         /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */
6074
6075         r = exec_runtime_allocate(&rt, id);
6076         if (r < 0)
6077                 return r;
6078
6079         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6080         if (r < 0)
6081                 return r;
6082
6083         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6084         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6085         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6086
6087         if (netns_storage_socket) {
6088                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6089                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6090         }
6091
6092         rt->manager = m;
6093
6094         if (ret)
6095                 *ret = rt;
6096         /* do not remove created ExecRuntime object when the operation succeeds. */
6097         TAKE_PTR(rt);
6098         return 0;
6099 }
6100
6101 static int exec_runtime_make(
6102                 Manager *m,
6103                 const ExecContext *c,
6104                 const char *id,
6105                 ExecRuntime **ret) {
6106
6107         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6108         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
6109         int r;
6110
6111         assert(m);
6112         assert(c);
6113         assert(id);
6114
6115         /* It is not necessary to create ExecRuntime object. */
6116         if (!c->private_network && !c->private_tmp && !c->network_namespace_path) {
6117                 *ret = NULL;
6118                 return 0;
6119         }
6120
6121         if (c->private_tmp &&
6122             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6123               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6124                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6125                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6126                 if (r < 0)
6127                         return r;
6128         }
6129
6130         if (c->private_network || c->network_namespace_path) {
6131                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6132                         return -errno;
6133         }
6134
6135         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret);
6136         if (r < 0)
6137                 return r;
6138
6139         return 1;
6140 }
6141
6142 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6143         ExecRuntime *rt;
6144         int r;
6145
6146         assert(m);
6147         assert(id);
6148         assert(ret);
6149
6150         rt = hashmap_get(m->exec_runtime_by_id, id);
6151         if (rt)
6152                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6153                 goto ref;
6154
6155         if (!create) {
6156                 *ret = NULL;
6157                 return 0;
6158         }
6159
6160         /* If not found, then create a new object. */
6161         r = exec_runtime_make(m, c, id, &rt);
6162         if (r < 0)
6163                 return r;
6164         if (r == 0) {
6165                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6166                 *ret = NULL;
6167                 return 0;
6168         }
6169
6170 ref:
6171         /* increment reference counter. */
6172         rt->n_ref++;
6173         *ret = rt;
6174         return 1;
6175 }
6176
6177 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6178         if (!rt)
6179                 return NULL;
6180
6181         assert(rt->n_ref > 0);
6182
6183         rt->n_ref--;
6184         if (rt->n_ref > 0)
6185                 return NULL;
6186
6187         return exec_runtime_free(rt, destroy);
6188 }
6189
6190 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6191         ExecRuntime *rt;
6192
6193         assert(m);
6194         assert(f);
6195         assert(fds);
6196
6197         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6198                 fprintf(f, "exec-runtime=%s", rt->id);
6199
6200                 if (rt->tmp_dir)
6201                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6202
6203                 if (rt->var_tmp_dir)
6204                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6205
6206                 if (rt->netns_storage_socket[0] >= 0) {
6207                         int copy;
6208
6209                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6210                         if (copy < 0)
6211                                 return copy;
6212
6213                         fprintf(f, " netns-socket-0=%i", copy);
6214                 }
6215
6216                 if (rt->netns_storage_socket[1] >= 0) {
6217                         int copy;
6218
6219                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6220                         if (copy < 0)
6221                                 return copy;
6222
6223                         fprintf(f, " netns-socket-1=%i", copy);
6224                 }
6225
6226                 fputc('\n', f);
6227         }
6228
6229         return 0;
6230 }
6231
6232 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6233         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6234         ExecRuntime *rt;
6235         int r;
6236
6237         /* This is for the migration from old (v237 or earlier) deserialization text.
6238          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6239          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6240          * so or not from the serialized text, then we always creates a new object owned by this. */
6241
6242         assert(u);
6243         assert(key);
6244         assert(value);
6245
6246         /* Manager manages ExecRuntime objects by the unit id.
6247          * So, we omit the serialized text when the unit does not have id (yet?)... */
6248         if (isempty(u->id)) {
6249                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6250                 return 0;
6251         }
6252
6253         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
6254         if (r < 0) {
6255                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
6256                 return 0;
6257         }
6258
6259         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6260         if (!rt) {
6261                 r = exec_runtime_allocate(&rt_create, u->id);
6262                 if (r < 0)
6263                         return log_oom();
6264
6265                 rt = rt_create;
6266         }
6267
6268         if (streq(key, "tmp-dir")) {
6269                 char *copy;
6270
6271                 copy = strdup(value);
6272                 if (!copy)
6273                         return log_oom();
6274
6275                 free_and_replace(rt->tmp_dir, copy);
6276
6277         } else if (streq(key, "var-tmp-dir")) {
6278                 char *copy;
6279
6280                 copy = strdup(value);
6281                 if (!copy)
6282                         return log_oom();
6283
6284                 free_and_replace(rt->var_tmp_dir, copy);
6285
6286         } else if (streq(key, "netns-socket-0")) {
6287                 int fd;
6288
6289                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6290                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6291                         return 0;
6292                 }
6293
6294                 safe_close(rt->netns_storage_socket[0]);
6295                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6296
6297         } else if (streq(key, "netns-socket-1")) {
6298                 int fd;
6299
6300                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6301                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6302                         return 0;
6303                 }
6304
6305                 safe_close(rt->netns_storage_socket[1]);
6306                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6307         } else
6308                 return 0;
6309
6310         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6311         if (rt_create) {
6312                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6313                 if (r < 0) {
6314                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6315                         return 0;
6316                 }
6317
6318                 rt_create->manager = u->manager;
6319
6320                 /* Avoid cleanup */
6321                 TAKE_PTR(rt_create);
6322         }
6323
6324         return 1;
6325 }
6326
6327 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6328         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6329         char *id = NULL;
6330         int r, fdpair[] = {-1, -1};
6331         const char *p, *v = value;
6332         size_t n;
6333
6334         assert(m);
6335         assert(value);
6336         assert(fds);
6337
6338         n = strcspn(v, " ");
6339         id = strndupa(v, n);
6340         if (v[n] != ' ')
6341                 goto finalize;
6342         p = v + n + 1;
6343
6344         v = startswith(p, "tmp-dir=");
6345         if (v) {
6346                 n = strcspn(v, " ");
6347                 tmp_dir = strndup(v, n);
6348                 if (!tmp_dir)
6349                         return log_oom();
6350                 if (v[n] != ' ')
6351                         goto finalize;
6352                 p = v + n + 1;
6353         }
6354
6355         v = startswith(p, "var-tmp-dir=");
6356         if (v) {
6357                 n = strcspn(v, " ");
6358                 var_tmp_dir = strndup(v, n);
6359                 if (!var_tmp_dir)
6360                         return log_oom();
6361                 if (v[n] != ' ')
6362                         goto finalize;
6363                 p = v + n + 1;
6364         }
6365
6366         v = startswith(p, "netns-socket-0=");
6367         if (v) {
6368                 char *buf;
6369
6370                 n = strcspn(v, " ");
6371                 buf = strndupa(v, n);
6372
6373                 r = safe_atoi(buf, &fdpair[0]);
6374                 if (r < 0)
6375                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6376                 if (!fdset_contains(fds, fdpair[0]))
6377                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6378                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", fdpair[0]);
6379                 fdpair[0] = fdset_remove(fds, fdpair[0]);
6380                 if (v[n] != ' ')
6381                         goto finalize;
6382                 p = v + n + 1;
6383         }
6384
6385         v = startswith(p, "netns-socket-1=");
6386         if (v) {
6387                 char *buf;
6388
6389                 n = strcspn(v, " ");
6390                 buf = strndupa(v, n);
6391                 r = safe_atoi(buf, &fdpair[1]);
6392                 if (r < 0)
6393                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6394                 if (!fdset_contains(fds, fdpair[1]))
6395                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6396                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", fdpair[1]);
6397                 fdpair[1] = fdset_remove(fds, fdpair[1]);
6398         }
6399
6400 finalize:
6401         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL);
6402         if (r < 0)
6403                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6404         return 0;
6405 }
6406
6407 void exec_runtime_vacuum(Manager *m) {
6408         ExecRuntime *rt;
6409
6410         assert(m);
6411
6412         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6413
6414         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6415                 if (rt->n_ref > 0)
6416                         continue;
6417
6418                 (void) exec_runtime_free(rt, false);
6419         }
6420 }
6421
6422 void exec_params_clear(ExecParameters *p) {
6423         if (!p)
6424                 return;
6425
6426         p->environment = strv_free(p->environment);
6427         p->fd_names = strv_free(p->fd_names);
6428         p->fds = mfree(p->fds);
6429         p->exec_fd = safe_close(p->exec_fd);
6430 }
6431
6432 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6433         if (!sc)
6434                 return NULL;
6435
6436         free(sc->id);
6437         free(sc->data);
6438         return mfree(sc);
6439 }
6440
6441 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6442
6443 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6444         [EXEC_INPUT_NULL] = "null",
6445         [EXEC_INPUT_TTY] = "tty",
6446         [EXEC_INPUT_TTY_FORCE] = "tty-force",
6447         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6448         [EXEC_INPUT_SOCKET] = "socket",
6449         [EXEC_INPUT_NAMED_FD] = "fd",
6450         [EXEC_INPUT_DATA] = "data",
6451         [EXEC_INPUT_FILE] = "file",
6452 };
6453
6454 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6455
6456 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6457         [EXEC_OUTPUT_INHERIT] = "inherit",
6458         [EXEC_OUTPUT_NULL] = "null",
6459         [EXEC_OUTPUT_TTY] = "tty",
6460         [EXEC_OUTPUT_KMSG] = "kmsg",
6461         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6462         [EXEC_OUTPUT_JOURNAL] = "journal",
6463         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6464         [EXEC_OUTPUT_SOCKET] = "socket",
6465         [EXEC_OUTPUT_NAMED_FD] = "fd",
6466         [EXEC_OUTPUT_FILE] = "file",
6467         [EXEC_OUTPUT_FILE_APPEND] = "append",
6468         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6469 };
6470
6471 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6472
6473 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6474         [EXEC_UTMP_INIT] = "init",
6475         [EXEC_UTMP_LOGIN] = "login",
6476         [EXEC_UTMP_USER] = "user",
6477 };
6478
6479 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6480
6481 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6482         [EXEC_PRESERVE_NO] = "no",
6483         [EXEC_PRESERVE_YES] = "yes",
6484         [EXEC_PRESERVE_RESTART] = "restart",
6485 };
6486
6487 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6488
6489 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6490 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6491         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6492         [EXEC_DIRECTORY_STATE] = "StateDirectory",
6493         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6494         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6495         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6496 };
6497
6498 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6499
6500 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6501  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6502  * directories, specifically .timer units with their timestamp touch file. */
6503 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6504         [EXEC_DIRECTORY_RUNTIME] = "runtime",
6505         [EXEC_DIRECTORY_STATE] = "state",
6506         [EXEC_DIRECTORY_CACHE] = "cache",
6507         [EXEC_DIRECTORY_LOGS] = "logs",
6508         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6509 };
6510
6511 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6512
6513 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6514  * the service payload in. */
6515 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6516         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6517         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6518         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6519         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6520         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6521 };
6522
6523 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6524
6525 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6526         [EXEC_KEYRING_INHERIT] = "inherit",
6527         [EXEC_KEYRING_PRIVATE] = "private",
6528         [EXEC_KEYRING_SHARED] = "shared",
6529 };
6530
6531 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);