src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "cgroup-setup.h"
  47 #include "chown-recursive.h"
  48 #include "cpu-set-util.h"
  49 #include "def.h"
  50 #include "env-file.h"
  51 #include "env-util.h"
  52 #include "errno-list.h"
  53 #include "execute.h"
  54 #include "exit-status.h"
  55 #include "fd-util.h"
  56 #include "fileio.h"
  57 #include "format-util.h"
  58 #include "fs-util.h"
  59 #include "glob-util.h"
  60 #include "hexdecoct.h"
  61 #include "io-util.h"
  62 #include "ioprio.h"
  63 #include "label.h"
  64 #include "log.h"
  65 #include "macro.h"
  66 #include "manager.h"
  67 #include "memory-util.h"
  68 #include "missing_fs.h"
  69 #include "mkdir.h"
  70 #include "mount-util.h"
  71 #include "mountpoint-util.h"
  72 #include "namespace.h"
  73 #include "parse-util.h"
  74 #include "path-util.h"
  75 #include "process-util.h"
  76 #include "random-util.h"
  77 #include "rlimit-util.h"
  78 #include "rm-rf.h"
  79 #if HAVE_SECCOMP
  80 #include "seccomp-util.h"
  81 #endif
  82 #include "securebits-util.h"
  83 #include "selinux-util.h"
  84 #include "signal-util.h"
  85 #include "smack-util.h"
  86 #include "socket-util.h"
  87 #include "special.h"
  88 #include "stat-util.h"
  89 #include "string-table.h"
  90 #include "string-util.h"
  91 #include "strv.h"
  92 #include "syslog-util.h"
  93 #include "terminal-util.h"
  94 #include "tmpfile-util.h"
  95 #include "umask-util.h"
  96 #include "unit.h"
  97 #include "user-util.h"
  98 #include "utmp-wtmp.h"
  99
 100 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 101 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 102
 103 #define SNDBUF_SIZE (8*1024*1024)
 104
 105 static int shift_fds(int fds[], size_t n_fds) {
 106         if (n_fds <= 0)
 107                 return 0;
 108
 109         /* Modifies the fds array! (sorts it) */
 110
 111         assert(fds);
 112
 113         for (int start = 0;;) {
 114                 int restart_from = -1;
 115
 116                 for (int i = start; i < (int) n_fds; i++) {
 117                         int nfd;
 118
 119                         /* Already at right index? */
 120                         if (fds[i] == i+3)
 121                                 continue;
 122
 123                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 124                         if (nfd < 0)
 125                                 return -errno;
 126
 127                         safe_close(fds[i]);
 128                         fds[i] = nfd;
 129
 130                         /* Hmm, the fd we wanted isn't free? Then
 131                          * let's remember that and try again from here */
 132                         if (nfd != i+3 && restart_from < 0)
 133                                 restart_from = i;
 134                 }
 135
 136                 if (restart_from < 0)
 137                         break;
 138
 139                 start = restart_from;
 140         }
 141
 142         return 0;
 143 }
 144
 145 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 146         size_t n_fds;
 147         int r;
 148
 149         n_fds = n_socket_fds + n_storage_fds;
 150         if (n_fds <= 0)
 151                 return 0;
 152
 153         assert(fds);
 154
 155         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 156          * O_NONBLOCK only applies to socket activation though. */
 157
 158         for (size_t i = 0; i < n_fds; i++) {
 159
 160                 if (i < n_socket_fds) {
 161                         r = fd_nonblock(fds[i], nonblock);
 162                         if (r < 0)
 163                                 return r;
 164                 }
 165
 166                 /* We unconditionally drop FD_CLOEXEC from the fds,
 167                  * since after all we want to pass these fds to our
 168                  * children */
 169
 170                 r = fd_cloexec(fds[i], false);
 171                 if (r < 0)
 172                         return r;
 173         }
 174
 175         return 0;
 176 }
 177
 178 static const char *exec_context_tty_path(const ExecContext *context) {
 179         assert(context);
 180
 181         if (context->stdio_as_fds)
 182                 return NULL;
 183
 184         if (context->tty_path)
 185                 return context->tty_path;
 186
 187         return "/dev/console";
 188 }
 189
 190 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 191         const char *path;
 192
 193         assert(context);
 194
 195         path = exec_context_tty_path(context);
 196
 197         if (context->tty_vhangup) {
 198                 if (p && p->stdin_fd >= 0)
 199                         (void) terminal_vhangup_fd(p->stdin_fd);
 200                 else if (path)
 201                         (void) terminal_vhangup(path);
 202         }
 203
 204         if (context->tty_reset) {
 205                 if (p && p->stdin_fd >= 0)
 206                         (void) reset_terminal_fd(p->stdin_fd, true);
 207                 else if (path)
 208                         (void) reset_terminal(path);
 209         }
 210
 211         if (context->tty_vt_disallocate && path)
 212                 (void) vt_disallocate(path);
 213 }
 214
 215 static bool is_terminal_input(ExecInput i) {
 216         return IN_SET(i,
 217                       EXEC_INPUT_TTY,
 218                       EXEC_INPUT_TTY_FORCE,
 219                       EXEC_INPUT_TTY_FAIL);
 220 }
 221
 222 static bool is_terminal_output(ExecOutput o) {
 223         return IN_SET(o,
 224                       EXEC_OUTPUT_TTY,
 225                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 226                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 227 }
 228
 229 static bool is_kmsg_output(ExecOutput o) {
 230         return IN_SET(o,
 231                       EXEC_OUTPUT_KMSG,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 233 }
 234
 235 static bool exec_context_needs_term(const ExecContext *c) {
 236         assert(c);
 237
 238         /* Return true if the execution context suggests we should set $TERM to something useful. */
 239
 240         if (is_terminal_input(c->std_input))
 241                 return true;
 242
 243         if (is_terminal_output(c->std_output))
 244                 return true;
 245
 246         if (is_terminal_output(c->std_error))
 247                 return true;
 248
 249         return !!c->tty_path;
 250 }
 251
 252 static int open_null_as(int flags, int nfd) {
 253         int fd;
 254
 255         assert(nfd >= 0);
 256
 257         fd = open("/dev/null", flags|O_NOCTTY);
 258         if (fd < 0)
 259                 return -errno;
 260
 261         return move_fd(fd, nfd, false);
 262 }
 263
 264 static int connect_journal_socket(
 265                 int fd,
 266                 const char *log_namespace,
 267                 uid_t uid,
 268                 gid_t gid) {
 269
 270         union sockaddr_union sa;
 271         socklen_t sa_len;
 272         uid_t olduid = UID_INVALID;
 273         gid_t oldgid = GID_INVALID;
 274         const char *j;
 275         int r;
 276
 277         j = log_namespace ?
 278                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 279                 "/run/systemd/journal/stdout";
 280         r = sockaddr_un_set_path(&sa.un, j);
 281         if (r < 0)
 282                 return r;
 283         sa_len = r;
 284
 285         if (gid_is_valid(gid)) {
 286                 oldgid = getgid();
 287
 288                 if (setegid(gid) < 0)
 289                         return -errno;
 290         }
 291
 292         if (uid_is_valid(uid)) {
 293                 olduid = getuid();
 294
 295                 if (seteuid(uid) < 0) {
 296                         r = -errno;
 297                         goto restore_gid;
 298                 }
 299         }
 300
 301         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 302
 303         /* If we fail to restore the uid or gid, things will likely
 304            fail later on. This should only happen if an LSM interferes. */
 305
 306         if (uid_is_valid(uid))
 307                 (void) seteuid(olduid);
 308
 309  restore_gid:
 310         if (gid_is_valid(gid))
 311                 (void) setegid(oldgid);
 312
 313         return r;
 314 }
 315
 316 static int connect_logger_as(
 317                 const Unit *unit,
 318                 const ExecContext *context,
 319                 const ExecParameters *params,
 320                 ExecOutput output,
 321                 const char *ident,
 322                 int nfd,
 323                 uid_t uid,
 324                 gid_t gid) {
 325
 326         _cleanup_close_ int fd = -1;
 327         int r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0)
 344                 return -errno;
 345
 346         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 347
 348         if (dprintf(fd,
 349                 "%s\n"
 350                 "%s\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n",
 356                 context->syslog_identifier ?: ident,
 357                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 358                 context->syslog_priority,
 359                 !!context->syslog_level_prefix,
 360                 false,
 361                 is_kmsg_output(output),
 362                 is_terminal_output(output)) < 0)
 363                 return -errno;
 364
 365         return move_fd(TAKE_FD(fd), nfd, false);
 366 }
 367
 368 static int open_terminal_as(const char *path, int flags, int nfd) {
 369         int fd;
 370
 371         assert(path);
 372         assert(nfd >= 0);
 373
 374         fd = open_terminal(path, flags | O_NOCTTY);
 375         if (fd < 0)
 376                 return fd;
 377
 378         return move_fd(fd, nfd, false);
 379 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa;
 383         socklen_t sa_len;
 384         _cleanup_close_ int fd = -1;
 385         int r;
 386
 387         assert(path);
 388
 389         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 390                 flags |= O_CREAT;
 391
 392         fd = open(path, flags|O_NOCTTY, mode);
 393         if (fd >= 0)
 394                 return TAKE_FD(fd);
 395
 396         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 397                 return -errno;
 398
 399         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 400
 401         r = sockaddr_un_set_path(&sa.un, path);
 402         if (r < 0)
 403                 return r == -EINVAL ? -ENXIO : r;
 404         sa_len = r;
 405
 406         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 407         if (fd < 0)
 408                 return -errno;
 409
 410         if (connect(fd, &sa.sa, sa_len) < 0)
 411                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 412                                                            * indication that this wasn't an AF_UNIX socket after all */
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 r = 0;
 420         if (r < 0)
 421                 return -errno;
 422
 423         return TAKE_FD(fd);
 424 }
 425
 426 static int fixup_input(
 427                 const ExecContext *context,
 428                 int socket_fd,
 429                 bool apply_tty_stdin) {
 430
 431         ExecInput std_input;
 432
 433         assert(context);
 434
 435         std_input = context->std_input;
 436
 437         if (is_terminal_input(std_input) && !apply_tty_stdin)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         return std_input;
 447 }
 448
 449 static int fixup_output(ExecOutput std_output, int socket_fd) {
 450
 451         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 452                 return EXEC_OUTPUT_INHERIT;
 453
 454         return std_output;
 455 }
 456
 457 static int setup_input(
 458                 const ExecContext *context,
 459                 const ExecParameters *params,
 460                 int socket_fd,
 461                 const int named_iofds[static 3]) {
 462
 463         ExecInput i;
 464
 465         assert(context);
 466         assert(params);
 467         assert(named_iofds);
 468
 469         if (params->stdin_fd >= 0) {
 470                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 471                         return -errno;
 472
 473                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 474                 if (isatty(STDIN_FILENO)) {
 475                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 476                         (void) reset_terminal_fd(STDIN_FILENO, true);
 477                 }
 478
 479                 return STDIN_FILENO;
 480         }
 481
 482         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 483
 484         switch (i) {
 485
 486         case EXEC_INPUT_NULL:
 487                 return open_null_as(O_RDONLY, STDIN_FILENO);
 488
 489         case EXEC_INPUT_TTY:
 490         case EXEC_INPUT_TTY_FORCE:
 491         case EXEC_INPUT_TTY_FAIL: {
 492                 int fd;
 493
 494                 fd = acquire_terminal(exec_context_tty_path(context),
 495                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 496                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 497                                                                   ACQUIRE_TERMINAL_WAIT,
 498                                       USEC_INFINITY);
 499                 if (fd < 0)
 500                         return fd;
 501
 502                 return move_fd(fd, STDIN_FILENO, false);
 503         }
 504
 505         case EXEC_INPUT_SOCKET:
 506                 assert(socket_fd >= 0);
 507
 508                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 509
 510         case EXEC_INPUT_NAMED_FD:
 511                 assert(named_iofds[STDIN_FILENO] >= 0);
 512
 513                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 514                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 515
 516         case EXEC_INPUT_DATA: {
 517                 int fd;
 518
 519                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 520                 if (fd < 0)
 521                         return fd;
 522
 523                 return move_fd(fd, STDIN_FILENO, false);
 524         }
 525
 526         case EXEC_INPUT_FILE: {
 527                 bool rw;
 528                 int fd;
 529
 530                 assert(context->stdio_file[STDIN_FILENO]);
 531
 532                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 533                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 534
 535                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         default:
 543                 assert_not_reached("Unknown input type");
 544         }
 545 }
 546
 547 static bool can_inherit_stderr_from_stdout(
 548                 const ExecContext *context,
 549                 ExecOutput o,
 550                 ExecOutput e) {
 551
 552         assert(context);
 553
 554         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 555          * stderr fd */
 556
 557         if (e == EXEC_OUTPUT_INHERIT)
 558                 return true;
 559         if (e != o)
 560                 return false;
 561
 562         if (e == EXEC_OUTPUT_NAMED_FD)
 563                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 564
 565         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 566                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 567
 568         return true;
 569 }
 570
 571 static int setup_output(
 572                 const Unit *unit,
 573                 const ExecContext *context,
 574                 const ExecParameters *params,
 575                 int fileno,
 576                 int socket_fd,
 577                 const int named_iofds[static 3],
 578                 const char *ident,
 579                 uid_t uid,
 580                 gid_t gid,
 581                 dev_t *journal_stream_dev,
 582                 ino_t *journal_stream_ino) {
 583
 584         ExecOutput o;
 585         ExecInput i;
 586         int r;
 587
 588         assert(unit);
 589         assert(context);
 590         assert(params);
 591         assert(ident);
 592         assert(journal_stream_dev);
 593         assert(journal_stream_ino);
 594
 595         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 596
 597                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 598                         return -errno;
 599
 600                 return STDOUT_FILENO;
 601         }
 602
 603         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 604                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 605                         return -errno;
 606
 607                 return STDERR_FILENO;
 608         }
 609
 610         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 611         o = fixup_output(context->std_output, socket_fd);
 612
 613         if (fileno == STDERR_FILENO) {
 614                 ExecOutput e;
 615                 e = fixup_output(context->std_error, socket_fd);
 616
 617                 /* This expects the input and output are already set up */
 618
 619                 /* Don't change the stderr file descriptor if we inherit all
 620                  * the way and are not on a tty */
 621                 if (e == EXEC_OUTPUT_INHERIT &&
 622                     o == EXEC_OUTPUT_INHERIT &&
 623                     i == EXEC_INPUT_NULL &&
 624                     !is_terminal_input(context->std_input) &&
 625                     getppid () != 1)
 626                         return fileno;
 627
 628                 /* Duplicate from stdout if possible */
 629                 if (can_inherit_stderr_from_stdout(context, o, e))
 630                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 631
 632                 o = e;
 633
 634         } else if (o == EXEC_OUTPUT_INHERIT) {
 635                 /* If input got downgraded, inherit the original value */
 636                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 637                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 638
 639                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 640                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 641                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 642
 643                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 644                 if (getppid() != 1)
 645                         return fileno;
 646
 647                 /* We need to open /dev/null here anew, to get the right access mode. */
 648                 return open_null_as(O_WRONLY, fileno);
 649         }
 650
 651         switch (o) {
 652
 653         case EXEC_OUTPUT_NULL:
 654                 return open_null_as(O_WRONLY, fileno);
 655
 656         case EXEC_OUTPUT_TTY:
 657                 if (is_terminal_input(i))
 658                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 659
 660                 /* We don't reset the terminal if this is just about output */
 661                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 662
 663         case EXEC_OUTPUT_KMSG:
 664         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 665         case EXEC_OUTPUT_JOURNAL:
 666         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 667                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 668                 if (r < 0) {
 669                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 670                         r = open_null_as(O_WRONLY, fileno);
 671                 } else {
 672                         struct stat st;
 673
 674                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 675                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 676                          * services to detect whether they are connected to the journal or not.
 677                          *
 678                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 679                          * about STDERR as that's usually the best way to do logging. */
 680
 681                         if (fstat(fileno, &st) >= 0 &&
 682                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 683                                 *journal_stream_dev = st.st_dev;
 684                                 *journal_stream_ino = st.st_ino;
 685                         }
 686                 }
 687                 return r;
 688
 689         case EXEC_OUTPUT_SOCKET:
 690                 assert(socket_fd >= 0);
 691
 692                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 693
 694         case EXEC_OUTPUT_NAMED_FD:
 695                 assert(named_iofds[fileno] >= 0);
 696
 697                 (void) fd_nonblock(named_iofds[fileno], false);
 698                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 699
 700         case EXEC_OUTPUT_FILE:
 701         case EXEC_OUTPUT_FILE_APPEND: {
 702                 bool rw;
 703                 int fd, flags;
 704
 705                 assert(context->stdio_file[fileno]);
 706
 707                 rw = context->std_input == EXEC_INPUT_FILE &&
 708                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 709
 710                 if (rw)
 711                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 712
 713                 flags = O_WRONLY;
 714                 if (o == EXEC_OUTPUT_FILE_APPEND)
 715                         flags |= O_APPEND;
 716
 717                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 718                 if (fd < 0)
 719                         return fd;
 720
 721                 return move_fd(fd, fileno, 0);
 722         }
 723
 724         default:
 725                 assert_not_reached("Unknown error type");
 726         }
 727 }
 728
 729 static int chown_terminal(int fd, uid_t uid) {
 730         int r;
 731
 732         assert(fd >= 0);
 733
 734         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 735         if (isatty(fd) < 1) {
 736                 if (IN_SET(errno, EINVAL, ENOTTY))
 737                         return 0; /* not a tty */
 738
 739                 return -errno;
 740         }
 741
 742         /* This might fail. What matters are the results. */
 743         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 744         if (r < 0)
 745                 return r;
 746
 747         return 1;
 748 }
 749
 750 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 751         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 752         int r;
 753
 754         assert(_saved_stdin);
 755         assert(_saved_stdout);
 756
 757         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 758         if (saved_stdin < 0)
 759                 return -errno;
 760
 761         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 762         if (saved_stdout < 0)
 763                 return -errno;
 764
 765         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 766         if (fd < 0)
 767                 return fd;
 768
 769         r = chown_terminal(fd, getuid());
 770         if (r < 0)
 771                 return r;
 772
 773         r = reset_terminal_fd(fd, true);
 774         if (r < 0)
 775                 return r;
 776
 777         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 778         fd = -1;
 779         if (r < 0)
 780                 return r;
 781
 782         *_saved_stdin = saved_stdin;
 783         *_saved_stdout = saved_stdout;
 784
 785         saved_stdin = saved_stdout = -1;
 786
 787         return 0;
 788 }
 789
 790 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 791         assert(err < 0);
 792
 793         if (err == -ETIMEDOUT)
 794                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 795         else {
 796                 errno = -err;
 797                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 798         }
 799 }
 800
 801 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 802         _cleanup_close_ int fd = -1;
 803
 804         assert(vc);
 805
 806         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 807         if (fd < 0)
 808                 return;
 809
 810         write_confirm_error_fd(err, fd, u);
 811 }
 812
 813 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 814         int r = 0;
 815
 816         assert(saved_stdin);
 817         assert(saved_stdout);
 818
 819         release_terminal();
 820
 821         if (*saved_stdin >= 0)
 822                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 823                         r = -errno;
 824
 825         if (*saved_stdout >= 0)
 826                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 827                         r = -errno;
 828
 829         *saved_stdin = safe_close(*saved_stdin);
 830         *saved_stdout = safe_close(*saved_stdout);
 831
 832         return r;
 833 }
 834
 835 enum {
 836         CONFIRM_PRETEND_FAILURE = -1,
 837         CONFIRM_PRETEND_SUCCESS =  0,
 838         CONFIRM_EXECUTE = 1,
 839 };
 840
 841 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 842         int saved_stdout = -1, saved_stdin = -1, r;
 843         _cleanup_free_ char *e = NULL;
 844         char c;
 845
 846         /* For any internal errors, assume a positive response. */
 847         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 848         if (r < 0) {
 849                 write_confirm_error(r, vc, u);
 850                 return CONFIRM_EXECUTE;
 851         }
 852
 853         /* confirm_spawn might have been disabled while we were sleeping. */
 854         if (manager_is_confirm_spawn_disabled(u->manager)) {
 855                 r = 1;
 856                 goto restore_stdio;
 857         }
 858
 859         e = ellipsize(cmdline, 60, 100);
 860         if (!e) {
 861                 log_oom();
 862                 r = CONFIRM_EXECUTE;
 863                 goto restore_stdio;
 864         }
 865
 866         for (;;) {
 867                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 868                 if (r < 0) {
 869                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 870                         r = CONFIRM_EXECUTE;
 871                         goto restore_stdio;
 872                 }
 873
 874                 switch (c) {
 875                 case 'c':
 876                         printf("Resuming normal execution.\n");
 877                         manager_disable_confirm_spawn();
 878                         r = 1;
 879                         break;
 880                 case 'D':
 881                         unit_dump(u, stdout, "  ");
 882                         continue; /* ask again */
 883                 case 'f':
 884                         printf("Failing execution.\n");
 885                         r = CONFIRM_PRETEND_FAILURE;
 886                         break;
 887                 case 'h':
 888                         printf("  c - continue, proceed without asking anymore\n"
 889                                "  D - dump, show the state of the unit\n"
 890                                "  f - fail, don't execute the command and pretend it failed\n"
 891                                "  h - help\n"
 892                                "  i - info, show a short summary of the unit\n"
 893                                "  j - jobs, show jobs that are in progress\n"
 894                                "  s - skip, don't execute the command and pretend it succeeded\n"
 895                                "  y - yes, execute the command\n");
 896                         continue; /* ask again */
 897                 case 'i':
 898                         printf("  Description: %s\n"
 899                                "  Unit:        %s\n"
 900                                "  Command:     %s\n",
 901                                u->id, u->description, cmdline);
 902                         continue; /* ask again */
 903                 case 'j':
 904                         manager_dump_jobs(u->manager, stdout, "  ");
 905                         continue; /* ask again */
 906                 case 'n':
 907                         /* 'n' was removed in favor of 'f'. */
 908                         printf("Didn't understand 'n', did you mean 'f'?\n");
 909                         continue; /* ask again */
 910                 case 's':
 911                         printf("Skipping execution.\n");
 912                         r = CONFIRM_PRETEND_SUCCESS;
 913                         break;
 914                 case 'y':
 915                         r = CONFIRM_EXECUTE;
 916                         break;
 917                 default:
 918                         assert_not_reached("Unhandled choice");
 919                 }
 920                 break;
 921         }
 922
 923 restore_stdio:
 924         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 925         return r;
 926 }
 927
 928 static int get_fixed_user(const ExecContext *c, const char **user,
 929                           uid_t *uid, gid_t *gid,
 930                           const char **home, const char **shell) {
 931         int r;
 932         const char *name;
 933
 934         assert(c);
 935
 936         if (!c->user)
 937                 return 0;
 938
 939         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 940          * (i.e. are "/" or "/bin/nologin"). */
 941
 942         name = c->user;
 943         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 944         if (r < 0)
 945                 return r;
 946
 947         *user = name;
 948         return 0;
 949 }
 950
 951 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 952         int r;
 953         const char *name;
 954
 955         assert(c);
 956
 957         if (!c->group)
 958                 return 0;
 959
 960         name = c->group;
 961         r = get_group_creds(&name, gid, 0);
 962         if (r < 0)
 963                 return r;
 964
 965         *group = name;
 966         return 0;
 967 }
 968
 969 static int get_supplementary_groups(const ExecContext *c, const char *user,
 970                                     const char *group, gid_t gid,
 971                                     gid_t **supplementary_gids, int *ngids) {
 972         char **i;
 973         int r, k = 0;
 974         int ngroups_max;
 975         bool keep_groups = false;
 976         gid_t *groups = NULL;
 977         _cleanup_free_ gid_t *l_gids = NULL;
 978
 979         assert(c);
 980
 981         /*
 982          * If user is given, then lookup GID and supplementary groups list.
 983          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 984          * here and as early as possible so we keep the list of supplementary
 985          * groups of the caller.
 986          */
 987         if (user && gid_is_valid(gid) && gid != 0) {
 988                 /* First step, initialize groups from /etc/groups */
 989                 if (initgroups(user, gid) < 0)
 990                         return -errno;
 991
 992                 keep_groups = true;
 993         }
 994
 995         if (strv_isempty(c->supplementary_groups))
 996                 return 0;
 997
 998         /*
 999          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1000          * be positive, otherwise fail.
1001          */
1002         errno = 0;
1003         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1004         if (ngroups_max <= 0)
1005                 return errno_or_else(EOPNOTSUPP);
1006
1007         l_gids = new(gid_t, ngroups_max);
1008         if (!l_gids)
1009                 return -ENOMEM;
1010
1011         if (keep_groups) {
1012                 /*
1013                  * Lookup the list of groups that the user belongs to, we
1014                  * avoid NSS lookups here too for gid=0.
1015                  */
1016                 k = ngroups_max;
1017                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1018                         return -EINVAL;
1019         } else
1020                 k = 0;
1021
1022         STRV_FOREACH(i, c->supplementary_groups) {
1023                 const char *g;
1024
1025                 if (k >= ngroups_max)
1026                         return -E2BIG;
1027
1028                 g = *i;
1029                 r = get_group_creds(&g, l_gids+k, 0);
1030                 if (r < 0)
1031                         return r;
1032
1033                 k++;
1034         }
1035
1036         /*
1037          * Sets ngids to zero to drop all supplementary groups, happens
1038          * when we are under root and SupplementaryGroups= is empty.
1039          */
1040         if (k == 0) {
1041                 *ngids = 0;
1042                 return 0;
1043         }
1044
1045         /* Otherwise get the final list of supplementary groups */
1046         groups = memdup(l_gids, sizeof(gid_t) * k);
1047         if (!groups)
1048                 return -ENOMEM;
1049
1050         *supplementary_gids = groups;
1051         *ngids = k;
1052
1053         groups = NULL;
1054
1055         return 0;
1056 }
1057
/* Applies the group identity of the process: first the supplementary groups (only if ngids is
 * positive), then the real/effective/saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative result of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1076
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if new securebits
 * were installed, or -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1090
1091 static int enforce_user(const ExecContext *context, uid_t uid) {
1092         assert(context);
1093         int r;
1094
1095         if (!uid_is_valid(uid))
1096                 return 0;
1097
1098         /* Sets (but doesn't look up) the uid and make sure we keep the
1099          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1100          * required, so we also need keep-caps in this case.
1101          */
1102
1103         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1104
1105                 /* First step: If we need to keep capabilities but
1106                  * drop privileges we need to make sure we keep our
1107                  * caps, while we drop privileges. */
1108                 if (uid != 0) {
1109                         /* Add KEEP_CAPS to the securebits */
1110                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1111                         if (r < 0)
1112                                 return r;
1113                 }
1114         }
1115
1116         /* Second step: actually set the uids */
1117         if (setresuid(uid, uid, uid) < 0)
1118                 return -errno;
1119
1120         /* At this point we should have all necessary capabilities but
1121            are otherwise a normal user. However, the caps might got
1122            corrupted due to the setresuid() so we need clean them up
1123            later. This is done outside of this call. */
1124
1125         return 0;
1126 }
1127
1128 #if HAVE_PAM
1129
1130 static int null_conv(
1131                 int num_msg,
1132                 const struct pam_message **msg,
1133                 struct pam_response **resp,
1134                 void *appdata_ptr) {
1135
1136         /* We don't support conversations */
1137
1138         return PAM_CONV_ERR;
1139 }
1140
1141 #endif
1142
/* Opens a PAM session for the service about to be executed and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process goes away.
 *
 * name  - PAM service name passed to pam_start()
 * user  - user name passed to pam_start()
 * uid   - UID the helper process drops to
 * gid   - GID the helper process drops to
 * tty   - TTY path to register as PAM_TTY; if NULL, the TTY of stdin is used if it has one
 * env   - environment block; its entries are exported to PAM, and on success it is replaced by the
 *         environment list returned by PAM
 * fds   - file descriptors that are closed in the helper process
 * n_fds - number of entries in fds
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or another
 * negative errno-style value if barrier creation or fork failed. Without HAVE_PAM this is a no-op
 * returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM modules quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* sigwait() reports failure through its return value (a positive error
                         * number), it neither returns a negative value nor sets errno. Hence
                         * check the return value directly, and only look at 'sig' on success. */
                        do
                                r = sigwait(&ss, &sig);
                        while (r == EINTR);
                        if (r != 0)
                                goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM-provided environment to the caller, releasing the old one */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1366
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * 'path'. At most the final 8 characters of that component are used, so the result never exceeds
 * 10 characters. If the component is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        /* '(' + up to 8 characters + ')' + NUL */
        char label[11];
        const char *tail;
        size_t n;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit
                 * might just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        label[0] = '(';
        memcpy(label + 1, tail, n);
        label[n + 1] = ')';
        label[n + 2] = 0;

        rename_process(label);
}
1397
1398 static bool context_has_address_families(const ExecContext *c) {
1399         assert(c);
1400
1401         return c->address_families_allow_list ||
1402                 !set_isempty(c->address_families);
1403 }
1404
1405 static bool context_has_syscall_filters(const ExecContext *c) {
1406         assert(c);
1407
1408         return c->syscall_allow_list ||
1409                 !hashmap_isempty(c->syscall_filter);
1410 }
1411
1412 static bool context_has_syscall_logs(const ExecContext *c) {
1413         assert(c);
1414
1415         return c->syscall_log_allow_list ||
1416                 !hashmap_isempty(c->syscall_log);
1417 }
1418
1419 static bool context_has_no_new_privileges(const ExecContext *c) {
1420         assert(c);
1421
1422         if (c->no_new_privileges)
1423                 return true;
1424
1425         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1426                 return false;
1427
1428         /* We need NNP if we have any form of seccomp and are unprivileged */
1429         return context_has_address_families(c) ||
1430                 c->memory_deny_write_execute ||
1431                 c->restrict_realtime ||
1432                 c->restrict_suid_sgid ||
1433                 exec_context_restrict_namespaces_set(c) ||
1434                 c->protect_clock ||
1435                 c->protect_kernel_tunables ||
1436                 c->protect_kernel_modules ||
1437                 c->protect_kernel_logs ||
1438                 c->private_devices ||
1439                 context_has_syscall_filters(c) ||
1440                 context_has_syscall_logs(c) ||
1441                 !set_isempty(c->syscall_archs) ||
1442                 c->lock_personality ||
1443                 c->protect_hostname;
1444 }
1445
1446 static bool exec_context_has_credentials(const ExecContext *context) {
1447
1448         assert(context);
1449
1450         return !hashmap_isempty(context->set_credentials) ||
1451                 context->load_credentials;
1452 }
1453
1454 #if HAVE_SECCOMP
1455
1456 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1457
1458         if (is_seccomp_available())
1459                 return false;
1460
1461         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1462         return true;
1463 }
1464
1465 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1466         uint32_t negative_action, default_action, action;
1467         int r;
1468
1469         assert(u);
1470         assert(c);
1471
1472         if (!context_has_syscall_filters(c))
1473                 return 0;
1474
1475         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1476                 return 0;
1477
1478         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1479
1480         if (c->syscall_allow_list) {
1481                 default_action = negative_action;
1482                 action = SCMP_ACT_ALLOW;
1483         } else {
1484                 default_action = SCMP_ACT_ALLOW;
1485                 action = negative_action;
1486         }
1487
1488         if (needs_ambient_hack) {
1489                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1490                 if (r < 0)
1491                         return r;
1492         }
1493
1494         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1495 }
1496
1497 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1498 #ifdef SCMP_ACT_LOG
1499         uint32_t default_action, action;
1500 #endif
1501
1502         assert(u);
1503         assert(c);
1504
1505         if (!context_has_syscall_logs(c))
1506                 return 0;
1507
1508 #ifdef SCMP_ACT_LOG
1509         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1510                 return 0;
1511
1512         if (c->syscall_log_allow_list) {
1513                 /* Log nothing but the ones listed */
1514                 default_action = SCMP_ACT_ALLOW;
1515                 action = SCMP_ACT_LOG;
1516         } else {
1517                 /* Log everything but the ones listed */
1518                 default_action = SCMP_ACT_LOG;
1519                 action = SCMP_ACT_ALLOW;
1520         }
1521
1522         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1523 #else
1524         /* old libseccomp */
1525         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1526         return 0;
1527 #endif
1528 }
1529
1530 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1531         assert(u);
1532         assert(c);
1533
1534         if (set_isempty(c->syscall_archs))
1535                 return 0;
1536
1537         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1538                 return 0;
1539
1540         return seccomp_restrict_archs(c->syscall_archs);
1541 }
1542
1543 static int apply_address_families(const Unit* u, const ExecContext *c) {
1544         assert(u);
1545         assert(c);
1546
1547         if (!context_has_address_families(c))
1548                 return 0;
1549
1550         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1551                 return 0;
1552
1553         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1554 }
1555
1556 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1557         assert(u);
1558         assert(c);
1559
1560         if (!c->memory_deny_write_execute)
1561                 return 0;
1562
1563         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1564                 return 0;
1565
1566         return seccomp_memory_deny_write_execute();
1567 }
1568
1569 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1570         assert(u);
1571         assert(c);
1572
1573         if (!c->restrict_realtime)
1574                 return 0;
1575
1576         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1577                 return 0;
1578
1579         return seccomp_restrict_realtime();
1580 }
1581
1582 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1583         assert(u);
1584         assert(c);
1585
1586         if (!c->restrict_suid_sgid)
1587                 return 0;
1588
1589         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1590                 return 0;
1591
1592         return seccomp_restrict_suid_sgid();
1593 }
1594
1595 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1596         assert(u);
1597         assert(c);
1598
1599         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1600          * let's protect even those systems where this is left on in the kernel. */
1601
1602         if (!c->protect_kernel_tunables)
1603                 return 0;
1604
1605         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1606                 return 0;
1607
1608         return seccomp_protect_sysctl();
1609 }
1610
1611 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1612         assert(u);
1613         assert(c);
1614
1615         /* Turn off module syscalls on ProtectKernelModules=yes */
1616
1617         if (!c->protect_kernel_modules)
1618                 return 0;
1619
1620         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1621                 return 0;
1622
1623         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1624 }
1625
1626 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1627         assert(u);
1628         assert(c);
1629
1630         if (!c->protect_kernel_logs)
1631                 return 0;
1632
1633         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1634                 return 0;
1635
1636         return seccomp_protect_syslog();
1637 }
1638
1639 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1640         assert(u);
1641         assert(c);
1642
1643         if (!c->protect_clock)
1644                 return 0;
1645
1646         if (skip_seccomp_unavailable(u, "ProtectClock="))
1647                 return 0;
1648
1649         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1650 }
1651
1652 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1653         assert(u);
1654         assert(c);
1655
1656         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1657
1658         if (!c->private_devices)
1659                 return 0;
1660
1661         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1662                 return 0;
1663
1664         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1665 }
1666
1667 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1668         assert(u);
1669         assert(c);
1670
1671         if (!exec_context_restrict_namespaces_set(c))
1672                 return 0;
1673
1674         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1675                 return 0;
1676
1677         return seccomp_restrict_namespaces(c->restrict_namespaces);
1678 }
1679
1680 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1681         unsigned long personality;
1682         int r;
1683
1684         assert(u);
1685         assert(c);
1686
1687         if (!c->lock_personality)
1688                 return 0;
1689
1690         if (skip_seccomp_unavailable(u, "LockPersonality="))
1691                 return 0;
1692
1693         personality = c->personality;
1694
1695         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1696         if (personality == PERSONALITY_INVALID) {
1697
1698                 r = opinionated_personality(&personality);
1699                 if (r < 0)
1700                         return r;
1701         }
1702
1703         return seccomp_lock_personality(personality);
1704 }
1705
1706 #endif
1707
1708 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1709         assert(u);
1710         assert(c);
1711
1712         if (!c->protect_hostname)
1713                 return 0;
1714
1715         if (ns_type_supported(NAMESPACE_UTS)) {
1716                 if (unshare(CLONE_NEWUTS) < 0) {
1717                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1718                                 *ret_exit_status = EXIT_NAMESPACE;
1719                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1720                         }
1721
1722                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1723                 }
1724         } else
1725                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1726
1727 #if HAVE_SECCOMP
1728         int r;
1729
1730         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1731                 return 0;
1732
1733         r = seccomp_protect_hostname();
1734         if (r < 0) {
1735                 *ret_exit_status = EXIT_SECCOMP;
1736                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1737         }
1738 #endif
1739
1740         return 0;
1741 }
1742
1743 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1744         assert(idle_pipe);
1745
1746         idle_pipe[1] = safe_close(idle_pipe[1]);
1747         idle_pipe[2] = safe_close(idle_pipe[2]);
1748
1749         if (idle_pipe[0] >= 0) {
1750                 int r;
1751
1752                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1753
1754                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1755                         ssize_t n;
1756
1757                         /* Signal systemd that we are bored and want to continue. */
1758                         n = write(idle_pipe[3], "x", 1);
1759                         if (n > 0)
1760                                 /* Wait for systemd to react to the signal above. */
1761                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1762                 }
1763
1764                 idle_pipe[0] = safe_close(idle_pipe[0]);
1765
1766         }
1767
1768         idle_pipe[3] = safe_close(idle_pipe[3]);
1769 }
1770
1771 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1772
1773 static int build_environment(
1774                 const Unit *u,
1775                 const ExecContext *c,
1776                 const ExecParameters *p,
1777                 size_t n_fds,
1778                 const char *home,
1779                 const char *username,
1780                 const char *shell,
1781                 dev_t journal_stream_dev,
1782                 ino_t journal_stream_ino,
1783                 char ***ret) {
1784
1785         _cleanup_strv_free_ char **our_env = NULL;
1786         size_t n_env = 0;
1787         char *x;
1788
1789         assert(u);
1790         assert(c);
1791         assert(p);
1792         assert(ret);
1793
1794 #define N_ENV_VARS 16
1795         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1796         if (!our_env)
1797                 return -ENOMEM;
1798
1799         if (n_fds > 0) {
1800                 _cleanup_free_ char *joined = NULL;
1801
1802                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1803                         return -ENOMEM;
1804                 our_env[n_env++] = x;
1805
1806                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1807                         return -ENOMEM;
1808                 our_env[n_env++] = x;
1809
1810                 joined = strv_join(p->fd_names, ":");
1811                 if (!joined)
1812                         return -ENOMEM;
1813
1814                 x = strjoin("LISTEN_FDNAMES=", joined);
1815                 if (!x)
1816                         return -ENOMEM;
1817                 our_env[n_env++] = x;
1818         }
1819
1820         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1821                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1822                         return -ENOMEM;
1823                 our_env[n_env++] = x;
1824
1825                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1826                         return -ENOMEM;
1827                 our_env[n_env++] = x;
1828         }
1829
1830         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1831          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1832          * check the database directly. */
1833         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1834                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1835                 if (!x)
1836                         return -ENOMEM;
1837                 our_env[n_env++] = x;
1838         }
1839
1840         if (home) {
1841                 x = strjoin("HOME=", home);
1842                 if (!x)
1843                         return -ENOMEM;
1844
1845                 path_simplify(x + 5, true);
1846                 our_env[n_env++] = x;
1847         }
1848
1849         if (username) {
1850                 x = strjoin("LOGNAME=", username);
1851                 if (!x)
1852                         return -ENOMEM;
1853                 our_env[n_env++] = x;
1854
1855                 x = strjoin("USER=", username);
1856                 if (!x)
1857                         return -ENOMEM;
1858                 our_env[n_env++] = x;
1859         }
1860
1861         if (shell) {
1862                 x = strjoin("SHELL=", shell);
1863                 if (!x)
1864                         return -ENOMEM;
1865
1866                 path_simplify(x + 6, true);
1867                 our_env[n_env++] = x;
1868         }
1869
1870         if (!sd_id128_is_null(u->invocation_id)) {
1871                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1872                         return -ENOMEM;
1873
1874                 our_env[n_env++] = x;
1875         }
1876
1877         if (exec_context_needs_term(c)) {
1878                 const char *tty_path, *term = NULL;
1879
1880                 tty_path = exec_context_tty_path(c);
1881
1882                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1883                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1884                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1885
1886                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1887                         term = getenv("TERM");
1888
1889                 if (!term)
1890                         term = default_term_for_tty(tty_path);
1891
1892                 x = strjoin("TERM=", term);
1893                 if (!x)
1894                         return -ENOMEM;
1895                 our_env[n_env++] = x;
1896         }
1897
1898         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1899                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1900                         return -ENOMEM;
1901
1902                 our_env[n_env++] = x;
1903         }
1904
1905         if (c->log_namespace) {
1906                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1907                 if (!x)
1908                         return -ENOMEM;
1909
1910                 our_env[n_env++] = x;
1911         }
1912
1913         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1914                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1915                 const char *n;
1916
1917                 if (!p->prefix[t])
1918                         continue;
1919
1920                 if (strv_isempty(c->directories[t].paths))
1921                         continue;
1922
1923                 n = exec_directory_env_name_to_string(t);
1924                 if (!n)
1925                         continue;
1926
1927                 pre = strjoin(p->prefix[t], "/");
1928                 if (!pre)
1929                         return -ENOMEM;
1930
1931                 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
1932                 if (!joined)
1933                         return -ENOMEM;
1934
1935                 x = strjoin(n, "=", joined);
1936                 if (!x)
1937                         return -ENOMEM;
1938
1939                 our_env[n_env++] = x;
1940         }
1941
1942         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1943                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1944                 if (!x)
1945                         return -ENOMEM;
1946
1947                 our_env[n_env++] = x;
1948         }
1949
1950         our_env[n_env++] = NULL;
1951         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1952 #undef N_ENV_VARS
1953
1954         *ret = TAKE_PTR(our_env);
1955
1956         return 0;
1957 }
1958
1959 static int build_pass_environment(const ExecContext *c, char ***ret) {
1960         _cleanup_strv_free_ char **pass_env = NULL;
1961         size_t n_env = 0, n_bufsize = 0;
1962         char **i;
1963
1964         STRV_FOREACH(i, c->pass_environment) {
1965                 _cleanup_free_ char *x = NULL;
1966                 char *v;
1967
1968                 v = getenv(*i);
1969                 if (!v)
1970                         continue;
1971                 x = strjoin(*i, "=", v);
1972                 if (!x)
1973                         return -ENOMEM;
1974
1975                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1976                         return -ENOMEM;
1977
1978                 pass_env[n_env++] = TAKE_PTR(x);
1979                 pass_env[n_env] = NULL;
1980         }
1981
1982         *ret = TAKE_PTR(pass_env);
1983
1984         return 0;
1985 }
1986
1987 static bool exec_needs_mount_namespace(
1988                 const ExecContext *context,
1989                 const ExecParameters *params,
1990                 const ExecRuntime *runtime) {
1991
1992         assert(context);
1993         assert(params);
1994
1995         if (context->root_image)
1996                 return true;
1997
1998         if (!strv_isempty(context->read_write_paths) ||
1999             !strv_isempty(context->read_only_paths) ||
2000             !strv_isempty(context->inaccessible_paths))
2001                 return true;
2002
2003         if (context->n_bind_mounts > 0)
2004                 return true;
2005
2006         if (context->n_temporary_filesystems > 0)
2007                 return true;
2008
2009         if (context->n_mount_images > 0)
2010                 return true;
2011
2012         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2013                 return true;
2014
2015         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2016                 return true;
2017
2018         if (context->private_devices ||
2019             context->private_mounts ||
2020             context->protect_system != PROTECT_SYSTEM_NO ||
2021             context->protect_home != PROTECT_HOME_NO ||
2022             context->protect_kernel_tunables ||
2023             context->protect_kernel_modules ||
2024             context->protect_kernel_logs ||
2025             context->protect_control_groups ||
2026             context->protect_proc != PROTECT_PROC_DEFAULT ||
2027             context->proc_subset != PROC_SUBSET_ALL)
2028                 return true;
2029
2030         if (context->root_directory) {
2031                 if (exec_context_get_effective_mount_apivfs(context))
2032                         return true;
2033
2034                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2035                         if (!params->prefix[t])
2036                                 continue;
2037
2038                         if (!strv_isempty(context->directories[t].paths))
2039                                 return true;
2040                 }
2041         }
2042
2043         if (context->dynamic_user &&
2044             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
2045              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2046              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2047                 return true;
2048
2049         if (context->log_namespace)
2050                 return true;
2051
2052         return false;
2053 }
2054
2055 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2056         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2057         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2058         _cleanup_close_ int unshare_ready_fd = -1;
2059         _cleanup_(sigkill_waitp) pid_t pid = 0;
2060         uint64_t c = 1;
2061         ssize_t n;
2062         int r;
2063
2064         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2065          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2066          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2067          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2068          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2069          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2070          * continues execution normally.
2071          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2072          * does not need CAP_SETUID to write the single line mapping to itself. */
2073
2074         /* Can only set up multiple mappings with CAP_SETUID. */
2075         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2076                 r = asprintf(&uid_map,
2077                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2078                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2079                              ouid, ouid, uid, uid);
2080         else
2081                 r = asprintf(&uid_map,
2082                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2083                              ouid, ouid);
2084
2085         if (r < 0)
2086                 return -ENOMEM;
2087
2088         /* Can only set up multiple mappings with CAP_SETGID. */
2089         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2090                 r = asprintf(&gid_map,
2091                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2092                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2093                              ogid, ogid, gid, gid);
2094         else
2095                 r = asprintf(&gid_map,
2096                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2097                              ogid, ogid);
2098
2099         if (r < 0)
2100                 return -ENOMEM;
2101
2102         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2103          * namespace. */
2104         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2105         if (unshare_ready_fd < 0)
2106                 return -errno;
2107
2108         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2109          * failed. */
2110         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2111                 return -errno;
2112
2113         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2114         if (r < 0)
2115                 return r;
2116         if (r == 0) {
2117                 _cleanup_close_ int fd = -1;
2118                 const char *a;
2119                 pid_t ppid;
2120
2121                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2122                  * here, after the parent opened its own user namespace. */
2123
2124                 ppid = getppid();
2125                 errno_pipe[0] = safe_close(errno_pipe[0]);
2126
2127                 /* Wait until the parent unshared the user namespace */
2128                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2129                         r = -errno;
2130                         goto child_fail;
2131                 }
2132
2133                 /* Disable the setgroups() system call in the child user namespace, for good. */
2134                 a = procfs_file_alloca(ppid, "setgroups");
2135                 fd = open(a, O_WRONLY|O_CLOEXEC);
2136                 if (fd < 0) {
2137                         if (errno != ENOENT) {
2138                                 r = -errno;
2139                                 goto child_fail;
2140                         }
2141
2142                         /* If the file is missing the kernel is too old, let's continue anyway. */
2143                 } else {
2144                         if (write(fd, "deny\n", 5) < 0) {
2145                                 r = -errno;
2146                                 goto child_fail;
2147                         }
2148
2149                         fd = safe_close(fd);
2150                 }
2151
2152                 /* First write the GID map */
2153                 a = procfs_file_alloca(ppid, "gid_map");
2154                 fd = open(a, O_WRONLY|O_CLOEXEC);
2155                 if (fd < 0) {
2156                         r = -errno;
2157                         goto child_fail;
2158                 }
2159                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2160                         r = -errno;
2161                         goto child_fail;
2162                 }
2163                 fd = safe_close(fd);
2164
2165                 /* The write the UID map */
2166                 a = procfs_file_alloca(ppid, "uid_map");
2167                 fd = open(a, O_WRONLY|O_CLOEXEC);
2168                 if (fd < 0) {
2169                         r = -errno;
2170                         goto child_fail;
2171                 }
2172                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2173                         r = -errno;
2174                         goto child_fail;
2175                 }
2176
2177                 _exit(EXIT_SUCCESS);
2178
2179         child_fail:
2180                 (void) write(errno_pipe[1], &r, sizeof(r));
2181                 _exit(EXIT_FAILURE);
2182         }
2183
2184         errno_pipe[1] = safe_close(errno_pipe[1]);
2185
2186         if (unshare(CLONE_NEWUSER) < 0)
2187                 return -errno;
2188
2189         /* Let the child know that the namespace is ready now */
2190         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2191                 return -errno;
2192
2193         /* Try to read an error code from the child */
2194         n = read(errno_pipe[0], &r, sizeof(r));
2195         if (n < 0)
2196                 return -errno;
2197         if (n == sizeof(r)) { /* an error code was sent to us */
2198                 if (r < 0)
2199                         return r;
2200                 return -EIO;
2201         }
2202         if (n != 0) /* on success we should have read 0 bytes */
2203                 return -EIO;
2204
2205         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2206         pid = 0;
2207         if (r < 0)
2208                 return r;
2209         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2210                 return -EIO;
2211
2212         return 0;
2213 }
2214
2215 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2216         if (!context->dynamic_user)
2217                 return false;
2218
2219         if (type == EXEC_DIRECTORY_CONFIGURATION)
2220                 return false;
2221
2222         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2223                 return false;
2224
2225         return true;
2226 }
2227
2228 static int setup_exec_directory(
2229                 const ExecContext *context,
2230                 const ExecParameters *params,
2231                 uid_t uid,
2232                 gid_t gid,
2233                 ExecDirectoryType type,
2234                 int *exit_status) {
2235
2236         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2237                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2238                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2239                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2240                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2241                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2242         };
2243         char **rt;
2244         int r;
2245
2246         assert(context);
2247         assert(params);
2248         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2249         assert(exit_status);
2250
2251         if (!params->prefix[type])
2252                 return 0;
2253
2254         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2255                 if (!uid_is_valid(uid))
2256                         uid = 0;
2257                 if (!gid_is_valid(gid))
2258                         gid = 0;
2259         }
2260
2261         STRV_FOREACH(rt, context->directories[type].paths) {
2262                 _cleanup_free_ char *p = NULL, *pp = NULL;
2263
2264                 p = path_join(params->prefix[type], *rt);
2265                 if (!p) {
2266                         r = -ENOMEM;
2267                         goto fail;
2268                 }
2269
2270                 r = mkdir_parents_label(p, 0755);
2271                 if (r < 0)
2272                         goto fail;
2273
2274                 if (exec_directory_is_private(context, type)) {
2275                         _cleanup_free_ char *private_root = NULL;
2276
2277                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2278                          * case we want to avoid leaving a directory around fully accessible that is owned by
2279                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2280                          * trick used by container managers to prohibit host users to get access to files of
2281                          * the same UID in containers: we place everything inside a directory that has an
2282                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2283                          * for unprivileged host code. We then use fs namespacing to make this directory
2284                          * permeable for the service itself.
2285                          *
2286                          * Specifically: for a service which wants a special directory "foo/" we first create
2287                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2288                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2289                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2290                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2291                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2292                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2293                          * for the service and making sure it only gets access to the dirs it needs but no
2294                          * others. Tricky? Yes, absolutely, but it works!
2295                          *
2296                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2297                          * to be owned by the service itself.
2298                          *
2299                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2300                          * for sharing files or sockets with other services. */
2301
2302                         private_root = path_join(params->prefix[type], "private");
2303                         if (!private_root) {
2304                                 r = -ENOMEM;
2305                                 goto fail;
2306                         }
2307
2308                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2309                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2310                         if (r < 0)
2311                                 goto fail;
2312
2313                         pp = path_join(private_root, *rt);
2314                         if (!pp) {
2315                                 r = -ENOMEM;
2316                                 goto fail;
2317                         }
2318
2319                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2320                         r = mkdir_parents_label(pp, 0755);
2321                         if (r < 0)
2322                                 goto fail;
2323
2324                         if (is_dir(p, false) > 0 &&
2325                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2326
2327                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2328                                  * it over. Most likely the service has been upgraded from one that didn't use
2329                                  * DynamicUser=1, to one that does. */
2330
2331                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2332                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2333                                          exec_directory_type_to_string(type), p, pp);
2334
2335                                 if (rename(p, pp) < 0) {
2336                                         r = -errno;
2337                                         goto fail;
2338                                 }
2339                         } else {
2340                                 /* Otherwise, create the actual directory for the service */
2341
2342                                 r = mkdir_label(pp, context->directories[type].mode);
2343                                 if (r < 0 && r != -EEXIST)
2344                                         goto fail;
2345                         }
2346
2347                         /* And link it up from the original place */
2348                         r = symlink_idempotent(pp, p, true);
2349                         if (r < 0)
2350                                 goto fail;
2351
2352                 } else {
2353                         _cleanup_free_ char *target = NULL;
2354
2355                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2356                             readlink_and_make_absolute(p, &target) >= 0) {
2357                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2358
2359                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2360                                  * by DynamicUser=1 (see above)?
2361                                  *
2362                                  * We do this for all directory types except for ConfigurationDirectory=,
2363                                  * since they all support the private/ symlink logic at least in some
2364                                  * configurations, see above. */
2365
2366                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2367                                 if (r < 0)
2368                                         goto fail;
2369
2370                                 q = path_join(params->prefix[type], "private", *rt);
2371                                 if (!q) {
2372                                         r = -ENOMEM;
2373                                         goto fail;
2374                                 }
2375
2376                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2377                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2378                                 if (r < 0)
2379                                         goto fail;
2380
2381                                 if (path_equal(q_resolved, target_resolved)) {
2382
2383                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2384                                          * but is no longer. Let's move the directory back up. */
2385
2386                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2387                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2388                                                  exec_directory_type_to_string(type), q, p);
2389
2390                                         if (unlink(p) < 0) {
2391                                                 r = -errno;
2392                                                 goto fail;
2393                                         }
2394
2395                                         if (rename(q, p) < 0) {
2396                                                 r = -errno;
2397                                                 goto fail;
2398                                         }
2399                                 }
2400                         }
2401
2402                         r = mkdir_label(p, context->directories[type].mode);
2403                         if (r < 0) {
2404                                 if (r != -EEXIST)
2405                                         goto fail;
2406
2407                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2408                                         struct stat st;
2409
2410                                         /* Don't change the owner/access mode of the configuration directory,
2411                                          * as in the common case it is not written to by a service, and shall
2412                                          * not be writable. */
2413
2414                                         if (stat(p, &st) < 0) {
2415                                                 r = -errno;
2416                                                 goto fail;
2417                                         }
2418
2419                                         /* Still complain if the access mode doesn't match */
2420                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2421                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2422                                                             "(File system: %o %sMode: %o)",
2423                                                             exec_directory_type_to_string(type), *rt,
2424                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2425
2426                                         continue;
2427                                 }
2428                         }
2429                 }
2430
2431                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2432                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2433                  * current UID/GID ownership.) */
2434                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2435                 if (r < 0)
2436                         goto fail;
2437
2438                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2439                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2440                  * assignments to exist.*/
2441                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2442                 if (r < 0)
2443                         goto fail;
2444         }
2445
2446         return 0;
2447
2448 fail:
2449         *exit_status = exit_status_table[type];
2450         return r;
2451 }
2452
2453 static int write_credential(
2454                 int dfd,
2455                 const char *id,
2456                 const void *data,
2457                 size_t size,
2458                 uid_t uid,
2459                 bool ownership_ok) {
2460
2461         _cleanup_(unlink_and_freep) char *tmp = NULL;
2462         _cleanup_close_ int fd = -1;
2463         int r;
2464
2465         r = tempfn_random_child("", "cred", &tmp);
2466         if (r < 0)
2467                 return r;
2468
2469         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2470         if (fd < 0) {
2471                 tmp = mfree(tmp);
2472                 return -errno;
2473         }
2474
2475         r = loop_write(fd, data, size, /* do_pool = */ false);
2476         if (r < 0)
2477                 return r;
2478
2479         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2480                 return -errno;
2481
2482         if (uid_is_valid(uid) && uid != getuid()) {
2483                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2484                 if (r < 0) {
2485                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2486                                 return r;
2487
2488                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2489                                             * to express: that the user gets read access and nothing
2490                                             * else. But if the backing fs can't support that (e.g. ramfs)
2491                                             * then we can use file ownership instead. But that's only safe if
2492                                             * we can then re-mount the whole thing read-only, so that the
2493                                             * user can no longer chmod() the file to gain write access. */
2494                                 return r;
2495
2496                         if (fchown(fd, uid, (gid_t) -1) < 0)
2497                                 return -errno;
2498                 }
2499         }
2500
2501         if (renameat(dfd, tmp, dfd, id) < 0)
2502                 return -errno;
2503
2504         tmp = mfree(tmp);
2505         return 0;
2506 }
2507
2508 #define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */
2509
2510 static int acquire_credentials(
2511                 const ExecContext *context,
2512                 const ExecParameters *params,
2513                 const char *unit,
2514                 const char *p,
2515                 uid_t uid,
2516                 bool ownership_ok) {
2517
2518         uint64_t left = CREDENTIALS_BYTES_MAX;
2519         _cleanup_close_ int dfd = -1;
2520         ExecSetCredential *sc;
2521         char **id, **fn;
2522         int r;
2523
2524         assert(context);
2525         assert(p);
2526
2527         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2528         if (dfd < 0)
2529                 return -errno;
2530
2531         /* First we use the literally specified credentials. Note that they might be overridden again below,
2532          * and thus act as a "default" if the same credential is specified multiple times */
2533         HASHMAP_FOREACH(sc, context->set_credentials) {
2534                 size_t add;
2535
2536                 add = strlen(sc->id) + sc->size;
2537                 if (add > left)
2538                         return -E2BIG;
2539
2540                 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2541                 if (r < 0)
2542                         return r;
2543
2544                 left -= add;
2545         }
2546
2547         /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2548         STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2549                 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2550                 _cleanup_(erase_and_freep) char *data = NULL;
2551                 _cleanup_free_ char *j = NULL, *bindname = NULL;
2552                 const char *source;
2553                 size_t size, add;
2554
2555                 if (path_is_absolute(*fn)) {
2556                         /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2557                         source = *fn;
2558                         flags |= READ_FULL_FILE_CONNECT_SOCKET;
2559
2560                         /* Pass some minimal info about the unit and the credential name we are looking to acquire
2561                          * via the source socket address in case we read off an AF_UNIX socket. */
2562                         if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2563                                 return -ENOMEM;
2564
2565                 } else if (params->received_credentials) {
2566                         /* If this is a relative path, take it relative to the credentials we received
2567                          * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2568                          * on a credential store, i.e. this is guaranteed to be regular files. */
2569                         j = path_join(params->received_credentials, *fn);
2570                         if (!j)
2571                                 return -ENOMEM;
2572
2573                         source = j;
2574                 } else
2575                         source = NULL;
2576
2577
2578                 if (source)
2579                         r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
2580                 else
2581                         r = -ENOENT;
2582                 if (r == -ENOENT &&
2583                     faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) /* If the source file doesn't exist, but we already acquired the key otherwise, then don't fail */
2584                         continue;
2585                 if (r < 0)
2586                         return r;
2587
2588                 add = strlen(*id) + size;
2589                 if (add > left)
2590                         return -E2BIG;
2591
2592                 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2593                 if (r < 0)
2594                         return r;
2595
2596                 left -= add;
2597         }
2598
2599         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2600                 return -errno;
2601
2602         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2603          * accessible */
2604
2605         if (uid_is_valid(uid) && uid != getuid()) {
2606                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2607                 if (r < 0) {
2608                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2609                                 return r;
2610
2611                         if (!ownership_ok)
2612                                 return r;
2613
2614                         if (fchown(dfd, uid, (gid_t) -1) < 0)
2615                                 return -errno;
2616                 }
2617         }
2618
2619         return 0;
2620 }
2621
2622 static int setup_credentials_internal(
2623                 const ExecContext *context,
2624                 const ExecParameters *params,
2625                 const char *unit,
2626                 const char *final,        /* This is where the credential store shall eventually end up at */
2627                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2628                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2629                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2630                 uid_t uid) {
2631
2632         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2633                                    * if we mounted something; false if we definitely can't mount anything */
2634         bool final_mounted;
2635         const char *where;
2636
2637         assert(context);
2638         assert(final);
2639         assert(workspace);
2640
2641         if (reuse_workspace) {
2642                 r = path_is_mount_point(workspace, NULL, 0);
2643                 if (r < 0)
2644                         return r;
2645                 if (r > 0)
2646                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2647                 else
2648                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2649         } else
2650                 workspace_mounted = -1; /* ditto */
2651
2652         r = path_is_mount_point(final, NULL, 0);
2653         if (r < 0)
2654                 return r;
2655         if (r > 0) {
2656                 /* If the final place already has something mounted, we use that. If the workspace also has
2657                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2658                  * different). */
2659                 final_mounted = true;
2660
2661                 if (workspace_mounted < 0) {
2662                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2663                          * the final version to the workspace, and make it writable, so that we can make
2664                          * changes */
2665
2666                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2667                         if (r < 0)
2668                                 return r;
2669
2670                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2671                         if (r < 0)
2672                                 return r;
2673
2674                         workspace_mounted = true;
2675                 }
2676         } else
2677                 final_mounted = false;
2678
2679         if (workspace_mounted < 0) {
2680                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2681                 for (int try = 0;; try++) {
2682
2683                         if (try == 0) {
2684                                 /* Try "ramfs" first, since it's not swap backed */
2685                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2686                                 if (r >= 0) {
2687                                         workspace_mounted = true;
2688                                         break;
2689                                 }
2690
2691                         } else if (try == 1) {
2692                                 _cleanup_free_ char *opts = NULL;
2693
2694                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2695                                         return -ENOMEM;
2696
2697                                 /* Fall back to "tmpfs" otherwise */
2698                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2699                                 if (r >= 0) {
2700                                         workspace_mounted = true;
2701                                         break;
2702                                 }
2703
2704                         } else {
2705                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2706                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2707                                 if (r < 0) {
2708                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2709                                                 return r;
2710
2711                                         if (must_mount) /* If we it's not OK to use the plain directory
2712                                                          * fallback, propagate all errors too */
2713                                                 return r;
2714
2715                                         /* If we lack privileges to bind mount stuff, then let's gracefully
2716                                          * proceed for compat with container envs, and just use the final dir
2717                                          * as is. */
2718
2719                                         workspace_mounted = false;
2720                                         break;
2721                                 }
2722
2723                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2724                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2725                                 if (r < 0)
2726                                         return r;
2727
2728                                 workspace_mounted = true;
2729                                 break;
2730                         }
2731                 }
2732         }
2733
2734         assert(!must_mount || workspace_mounted > 0);
2735         where = workspace_mounted ? workspace : final;
2736
2737         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2738         if (r < 0)
2739                 return r;
2740
2741         if (workspace_mounted) {
2742                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2743                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2744                 if (r < 0)
2745                         return r;
2746
2747                 /* And mount it to the final place, read-only */
2748                 if (final_mounted)
2749                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2750                 else
2751                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2752                 if (r < 0)
2753                         return r;
2754         } else {
2755                 _cleanup_free_ char *parent = NULL;
2756
2757                 /* If we do not have our own mount put used the plain directory fallback, then we need to
2758                  * open access to the top-level credential directory and the per-service directory now */
2759
2760                 parent = dirname_malloc(final);
2761                 if (!parent)
2762                         return -ENOMEM;
2763                 if (chmod(parent, 0755) < 0)
2764                         return -errno;
2765         }
2766
2767         return 0;
2768 }
2769
2770 static int setup_credentials(
2771                 const ExecContext *context,
2772                 const ExecParameters *params,
2773                 const char *unit,
2774                 uid_t uid) {
2775
2776         _cleanup_free_ char *p = NULL, *q = NULL;
2777         const char *i;
2778         int r;
2779
2780         assert(context);
2781         assert(params);
2782
2783         if (!exec_context_has_credentials(context))
2784                 return 0;
2785
2786         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2787                 return -EINVAL;
2788
2789         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2790          * and the subdir we mount over with a read-only file system readable by the service's user */
2791         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2792         if (!q)
2793                 return -ENOMEM;
2794
2795         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2796         if (r < 0 && r != -EEXIST)
2797                 return r;
2798
2799         p = path_join(q, unit);
2800         if (!p)
2801                 return -ENOMEM;
2802
2803         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2804         if (r < 0 && r != -EEXIST)
2805                 return r;
2806
2807         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2808         if (r < 0) {
2809                 _cleanup_free_ char *t = NULL, *u = NULL;
2810
2811                 /* If this is not a privilege or support issue then propagate the error */
2812                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2813                         return r;
2814
2815                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2816                  * it into place, so that users can't access half-initialized credential stores. */
2817                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2818                 if (!t)
2819                         return -ENOMEM;
2820
2821                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2822                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2823                  * after it is fully set up */
2824                 u = path_join(t, unit);
2825                 if (!u)
2826                         return -ENOMEM;
2827
2828                 FOREACH_STRING(i, t, u) {
2829                         r = mkdir_label(i, 0700);
2830                         if (r < 0 && r != -EEXIST)
2831                                 return r;
2832                 }
2833
2834                 r = setup_credentials_internal(
2835                                 context,
2836                                 params,
2837                                 unit,
2838                                 p,       /* final mount point */
2839                                 u,       /* temporary workspace to overmount */
2840                                 true,    /* reuse the workspace if it is already a mount */
2841                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
2842                                 uid);
2843
2844                 (void) rmdir(u); /* remove the workspace again if we can. */
2845
2846                 if (r < 0)
2847                         return r;
2848
2849         } else if (r == 0) {
2850
2851                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2852                  * we can use the same directory for all cases, after turning off propagation. Question
2853                  * though is: where do we turn off propagation exactly, and where do we place the workspace
2854                  * directory? We need some place that is guaranteed to be a mount point in the host, and
2855                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2856                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
2857                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2858                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2859                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2860                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2861                  * propagation on the former, and then overmount the latter.
2862                  *
2863                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2864                  * for this purpose, but there are few other candidates that work equally well for us, and
2865                  * given that the we do this in a privately namespaced short-lived single-threaded process
2866                  * that no one else sees this should be OK to do.*/
2867
2868                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2869                 if (r < 0)
2870                         goto child_fail;
2871
2872                 r = setup_credentials_internal(
2873                                 context,
2874                                 params,
2875                                 unit,
2876                                 p,           /* final mount point */
2877                                 "/dev/shm",  /* temporary workspace to overmount */
2878                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2879                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
2880                                 uid);
2881                 if (r < 0)
2882                         goto child_fail;
2883
2884                 _exit(EXIT_SUCCESS);
2885
2886         child_fail:
2887                 _exit(EXIT_FAILURE);
2888         }
2889
2890         return 0;
2891 }
2892
#if ENABLE_SMACK
/* Applies the SMACK process label via mac_smack_apply_pid(0, ...).
 *
 * The label configured in context->smack_process_label takes precedence. If none is configured and
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 * 'executable_fd' is used, or SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing xattr support is fine, we use the default then */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2924
2925 static int compile_bind_mounts(
2926                 const ExecContext *context,
2927                 const ExecParameters *params,
2928                 BindMount **ret_bind_mounts,
2929                 size_t *ret_n_bind_mounts,
2930                 char ***ret_empty_directories) {
2931
2932         _cleanup_strv_free_ char **empty_directories = NULL;
2933         BindMount *bind_mounts;
2934         size_t n, h = 0;
2935         int r;
2936
2937         assert(context);
2938         assert(params);
2939         assert(ret_bind_mounts);
2940         assert(ret_n_bind_mounts);
2941         assert(ret_empty_directories);
2942
2943         n = context->n_bind_mounts;
2944         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2945                 if (!params->prefix[t])
2946                         continue;
2947
2948                 n += strv_length(context->directories[t].paths);
2949         }
2950
2951         if (n <= 0) {
2952                 *ret_bind_mounts = NULL;
2953                 *ret_n_bind_mounts = 0;
2954                 *ret_empty_directories = NULL;
2955                 return 0;
2956         }
2957
2958         bind_mounts = new(BindMount, n);
2959         if (!bind_mounts)
2960                 return -ENOMEM;
2961
2962         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2963                 BindMount *item = context->bind_mounts + i;
2964                 char *s, *d;
2965
2966                 s = strdup(item->source);
2967                 if (!s) {
2968                         r = -ENOMEM;
2969                         goto finish;
2970                 }
2971
2972                 d = strdup(item->destination);
2973                 if (!d) {
2974                         free(s);
2975                         r = -ENOMEM;
2976                         goto finish;
2977                 }
2978
2979                 bind_mounts[h++] = (BindMount) {
2980                         .source = s,
2981                         .destination = d,
2982                         .read_only = item->read_only,
2983                         .recursive = item->recursive,
2984                         .ignore_enoent = item->ignore_enoent,
2985                 };
2986         }
2987
2988         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2989                 char **suffix;
2990
2991                 if (!params->prefix[t])
2992                         continue;
2993
2994                 if (strv_isempty(context->directories[t].paths))
2995                         continue;
2996
2997                 if (exec_directory_is_private(context, t) &&
2998                     !exec_context_with_rootfs(context)) {
2999                         char *private_root;
3000
3001                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3002                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3003                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3004
3005                         private_root = path_join(params->prefix[t], "private");
3006                         if (!private_root) {
3007                                 r = -ENOMEM;
3008                                 goto finish;
3009                         }
3010
3011                         r = strv_consume(&empty_directories, private_root);
3012                         if (r < 0)
3013                                 goto finish;
3014                 }
3015
3016                 STRV_FOREACH(suffix, context->directories[t].paths) {
3017                         char *s, *d;
3018
3019                         if (exec_directory_is_private(context, t))
3020                                 s = path_join(params->prefix[t], "private", *suffix);
3021                         else
3022                                 s = path_join(params->prefix[t], *suffix);
3023                         if (!s) {
3024                                 r = -ENOMEM;
3025                                 goto finish;
3026                         }
3027
3028                         if (exec_directory_is_private(context, t) &&
3029                             exec_context_with_rootfs(context))
3030                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3031                                  * directory is not created on the root directory. So, let's bind-mount the directory
3032                                  * on the 'non-private' place. */
3033                                 d = path_join(params->prefix[t], *suffix);
3034                         else
3035                                 d = strdup(s);
3036                         if (!d) {
3037                                 free(s);
3038                                 r = -ENOMEM;
3039                                 goto finish;
3040                         }
3041
3042                         bind_mounts[h++] = (BindMount) {
3043                                 .source = s,
3044                                 .destination = d,
3045                                 .read_only = false,
3046                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3047                                 .recursive = true,
3048                                 .ignore_enoent = false,
3049                         };
3050                 }
3051         }
3052
3053         assert(h == n);
3054
3055         *ret_bind_mounts = bind_mounts;
3056         *ret_n_bind_mounts = n;
3057         *ret_empty_directories = TAKE_PTR(empty_directories);
3058
3059         return (int) n;
3060
3061 finish:
3062         bind_mount_free_many(bind_mounts, h);
3063         return r;
3064 }
3065
3066 static bool insist_on_sandboxing(
3067                 const ExecContext *context,
3068                 const char *root_dir,
3069                 const char *root_image,
3070                 const BindMount *bind_mounts,
3071                 size_t n_bind_mounts) {
3072
3073         assert(context);
3074         assert(n_bind_mounts == 0 || bind_mounts);
3075
3076         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3077          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3078          * rearrange stuff in a way we cannot ignore gracefully. */
3079
3080         if (context->n_temporary_filesystems > 0)
3081                 return true;
3082
3083         if (root_dir || root_image)
3084                 return true;
3085
3086         if (context->n_mount_images > 0)
3087                 return true;
3088
3089         if (context->dynamic_user)
3090                 return true;
3091
3092         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3093          * essential. */
3094         for (size_t i = 0; i < n_bind_mounts; i++)
3095                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3096                         return true;
3097
3098         if (context->log_namespace)
3099                 return true;
3100
3101         return false;
3102 }
3103
3104 static int apply_mount_namespace(
3105                 const Unit *u,
3106                 ExecCommandFlags command_flags,
3107                 const ExecContext *context,
3108                 const ExecParameters *params,
3109                 const ExecRuntime *runtime,
3110                 char **error_path) {
3111
3112         _cleanup_strv_free_ char **empty_directories = NULL;
3113         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3114         const char *root_dir = NULL, *root_image = NULL;
3115         _cleanup_free_ char *creds_path = NULL;
3116         NamespaceInfo ns_info;
3117         bool needs_sandboxing;
3118         BindMount *bind_mounts = NULL;
3119         size_t n_bind_mounts = 0;
3120         int r;
3121
3122         assert(context);
3123
3124         if (params->flags & EXEC_APPLY_CHROOT) {
3125                 root_image = context->root_image;
3126
3127                 if (!root_image)
3128                         root_dir = context->root_directory;
3129         }
3130
3131         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3132         if (r < 0)
3133                 return r;
3134
3135         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3136         if (needs_sandboxing) {
3137                 /* The runtime struct only contains the parent of the private /tmp,
3138                  * which is non-accessible to world users. Inside of it there's a /tmp
3139                  * that is sticky, and that's the one we want to use here.
3140                  * This does not apply when we are using /run/systemd/empty as fallback. */
3141
3142                 if (context->private_tmp && runtime) {
3143                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3144                                 tmp_dir = runtime->tmp_dir;
3145                         else if (runtime->tmp_dir)
3146                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3147
3148                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3149                                 var_tmp_dir = runtime->var_tmp_dir;
3150                         else if (runtime->var_tmp_dir)
3151                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3152                 }
3153
3154                 ns_info = (NamespaceInfo) {
3155                         .ignore_protect_paths = false,
3156                         .private_dev = context->private_devices,
3157                         .protect_control_groups = context->protect_control_groups,
3158                         .protect_kernel_tunables = context->protect_kernel_tunables,
3159                         .protect_kernel_modules = context->protect_kernel_modules,
3160                         .protect_kernel_logs = context->protect_kernel_logs,
3161                         .protect_hostname = context->protect_hostname,
3162                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3163                         .private_mounts = context->private_mounts,
3164                         .protect_home = context->protect_home,
3165                         .protect_system = context->protect_system,
3166                         .protect_proc = context->protect_proc,
3167                         .proc_subset = context->proc_subset,
3168                 };
3169         } else if (!context->dynamic_user && root_dir)
3170                 /*
3171                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3172                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3173                  * fail if we are enable to apply the sandbox inside the mount namespace.
3174                  */
3175                 ns_info = (NamespaceInfo) {
3176                         .ignore_protect_paths = true,
3177                 };
3178         else
3179                 ns_info = (NamespaceInfo) {};
3180
3181         if (context->mount_flags == MS_SHARED)
3182                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3183
3184         if (exec_context_has_credentials(context) && params->prefix[EXEC_DIRECTORY_RUNTIME]) {
3185                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3186                 if (!creds_path) {
3187                         r = -ENOMEM;
3188                         goto finalize;
3189                 }
3190         }
3191
3192         r = setup_namespace(root_dir, root_image, context->root_image_options,
3193                             &ns_info, context->read_write_paths,
3194                             needs_sandboxing ? context->read_only_paths : NULL,
3195                             needs_sandboxing ? context->inaccessible_paths : NULL,
3196                             empty_directories,
3197                             bind_mounts,
3198                             n_bind_mounts,
3199                             context->temporary_filesystems,
3200                             context->n_temporary_filesystems,
3201                             context->mount_images,
3202                             context->n_mount_images,
3203                             tmp_dir,
3204                             var_tmp_dir,
3205                             creds_path,
3206                             context->log_namespace,
3207                             context->mount_flags,
3208                             context->root_hash, context->root_hash_size, context->root_hash_path,
3209                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3210                             context->root_verity,
3211                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
3212                             error_path);
3213
3214         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3215          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3216          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3217          * completely different execution environment. */
3218         if (r == -ENOANO) {
3219                 if (insist_on_sandboxing(
3220                                     context,
3221                                     root_dir, root_image,
3222                                     bind_mounts,
3223                                     n_bind_mounts)) {
3224                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3225                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3226                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3227
3228                         r = -EOPNOTSUPP;
3229                 } else {
3230                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3231                         r = 0;
3232                 }
3233         }
3234
3235 finalize:
3236         bind_mount_free_many(bind_mounts, n_bind_mounts);
3237         return r;
3238 }
3239
3240 static int apply_working_directory(
3241                 const ExecContext *context,
3242                 const ExecParameters *params,
3243                 const char *home,
3244                 int *exit_status) {
3245
3246         const char *d, *wd;
3247
3248         assert(context);
3249         assert(exit_status);
3250
3251         if (context->working_directory_home) {
3252
3253                 if (!home) {
3254                         *exit_status = EXIT_CHDIR;
3255                         return -ENXIO;
3256                 }
3257
3258                 wd = home;
3259
3260         } else
3261                 wd = empty_to_root(context->working_directory);
3262
3263         if (params->flags & EXEC_APPLY_CHROOT)
3264                 d = wd;
3265         else
3266                 d = prefix_roota(context->root_directory, wd);
3267
3268         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3269                 *exit_status = EXIT_CHDIR;
3270                 return -errno;
3271         }
3272
3273         return 0;
3274 }
3275
3276 static int apply_root_directory(
3277                 const ExecContext *context,
3278                 const ExecParameters *params,
3279                 const bool needs_mount_ns,
3280                 int *exit_status) {
3281
3282         assert(context);
3283         assert(exit_status);
3284
3285         if (params->flags & EXEC_APPLY_CHROOT)
3286                 if (!needs_mount_ns && context->root_directory)
3287                         if (chroot(context->root_directory) < 0) {
3288                                 *exit_status = EXIT_CHROOT;
3289                                 return -errno;
3290                         }
3291
3292         return 0;
3293 }
3294
3295 static int setup_keyring(
3296                 const Unit *u,
3297                 const ExecContext *context,
3298                 const ExecParameters *p,
3299                 uid_t uid, gid_t gid) {
3300
3301         key_serial_t keyring;
3302         int r = 0;
3303         uid_t saved_uid;
3304         gid_t saved_gid;
3305
3306         assert(u);
3307         assert(context);
3308         assert(p);
3309
3310         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3311          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3312          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3313          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3314          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3315          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3316
3317         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3318                 return 0;
3319
3320         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3321          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3322          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3323          * & group is just as nasty as acquiring a reference to the user keyring. */
3324
3325         saved_uid = getuid();
3326         saved_gid = getgid();
3327
3328         if (gid_is_valid(gid) && gid != saved_gid) {
3329                 if (setregid(gid, -1) < 0)
3330                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3331         }
3332
3333         if (uid_is_valid(uid) && uid != saved_uid) {
3334                 if (setreuid(uid, -1) < 0) {
3335                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3336                         goto out;
3337                 }
3338         }
3339
3340         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3341         if (keyring == -1) {
3342                 if (errno == ENOSYS)
3343                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3344                 else if (ERRNO_IS_PRIVILEGE(errno))
3345                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3346                 else if (errno == EDQUOT)
3347                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3348                 else
3349                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3350
3351                 goto out;
3352         }
3353
3354         /* When requested link the user keyring into the session keyring. */
3355         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3356
3357                 if (keyctl(KEYCTL_LINK,
3358                            KEY_SPEC_USER_KEYRING,
3359                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3360                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3361                         goto out;
3362                 }
3363         }
3364
3365         /* Restore uid/gid back */
3366         if (uid_is_valid(uid) && uid != saved_uid) {
3367                 if (setreuid(saved_uid, -1) < 0) {
3368                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3369                         goto out;
3370                 }
3371         }
3372
3373         if (gid_is_valid(gid) && gid != saved_gid) {
3374                 if (setregid(saved_gid, -1) < 0)
3375                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3376         }
3377
3378         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3379         if (!sd_id128_is_null(u->invocation_id)) {
3380                 key_serial_t key;
3381
3382                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3383                 if (key == -1)
3384                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3385                 else {
3386                         if (keyctl(KEYCTL_SETPERM, key,
3387                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3388                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3389                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3390                 }
3391         }
3392
3393 out:
3394         /* Revert back uid & gid for the last time, and exit */
3395         /* no extra logging, as only the first already reported error matters */
3396         if (getuid() != saved_uid)
3397                 (void) setreuid(saved_uid, -1);
3398
3399         if (getgid() != saved_gid)
3400                 (void) setregid(saved_gid, -1);
3401
3402         return r;
3403 }
3404
/* Appends every non-negative fd of 'pair' to 'array', first pair[0] and then
 * pair[1], advancing the element counter *n for each fd that was stored. The
 * caller must make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3415
3416 static int close_remaining_fds(
3417                 const ExecParameters *params,
3418                 const ExecRuntime *runtime,
3419                 const DynamicCreds *dcreds,
3420                 int user_lookup_fd,
3421                 int socket_fd,
3422                 const int *fds, size_t n_fds) {
3423
3424         size_t n_dont_close = 0;
3425         int dont_close[n_fds + 12];
3426
3427         assert(params);
3428
3429         if (params->stdin_fd >= 0)
3430                 dont_close[n_dont_close++] = params->stdin_fd;
3431         if (params->stdout_fd >= 0)
3432                 dont_close[n_dont_close++] = params->stdout_fd;
3433         if (params->stderr_fd >= 0)
3434                 dont_close[n_dont_close++] = params->stderr_fd;
3435
3436         if (socket_fd >= 0)
3437                 dont_close[n_dont_close++] = socket_fd;
3438         if (n_fds > 0) {
3439                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3440                 n_dont_close += n_fds;
3441         }
3442
3443         if (runtime)
3444                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3445
3446         if (dcreds) {
3447                 if (dcreds->user)
3448                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3449                 if (dcreds->group)
3450                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3451         }
3452
3453         if (user_lookup_fd >= 0)
3454                 dont_close[n_dont_close++] = user_lookup_fd;
3455
3456         return close_all_fds(dont_close, n_dont_close);
3457 }
3458
3459 static int send_user_lookup(
3460                 Unit *unit,
3461                 int user_lookup_fd,
3462                 uid_t uid,
3463                 gid_t gid) {
3464
3465         assert(unit);
3466
3467         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3468          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3469          * specified. */
3470
3471         if (user_lookup_fd < 0)
3472                 return 0;
3473
3474         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3475                 return 0;
3476
3477         if (writev(user_lookup_fd,
3478                (struct iovec[]) {
3479                            IOVEC_INIT(&uid, sizeof(uid)),
3480                            IOVEC_INIT(&gid, sizeof(gid)),
3481                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3482                 return -errno;
3483
3484         return 0;
3485 }
3486
3487 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3488         int r;
3489
3490         assert(c);
3491         assert(home);
3492         assert(buf);
3493
3494         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3495
3496         if (*home)
3497                 return 0;
3498
3499         if (!c->working_directory_home)
3500                 return 0;
3501
3502         r = get_home_dir(buf);
3503         if (r < 0)
3504                 return r;
3505
3506         *home = *buf;
3507         return 1;
3508 }
3509
3510 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3511         _cleanup_strv_free_ char ** list = NULL;
3512         int r;
3513
3514         assert(c);
3515         assert(p);
3516         assert(ret);
3517
3518         assert(c->dynamic_user);
3519
3520         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3521          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3522          * directories. */
3523
3524         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3525                 char **i;
3526
3527                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3528                         continue;
3529
3530                 if (!p->prefix[t])
3531                         continue;
3532
3533                 STRV_FOREACH(i, c->directories[t].paths) {
3534                         char *e;
3535
3536                         if (exec_directory_is_private(c, t))
3537                                 e = path_join(p->prefix[t], "private", *i);
3538                         else
3539                                 e = path_join(p->prefix[t], *i);
3540                         if (!e)
3541                                 return -ENOMEM;
3542
3543                         r = strv_consume(&list, e);
3544                         if (r < 0)
3545                                 return r;
3546                 }
3547         }
3548
3549         *ret = TAKE_PTR(list);
3550
3551         return 0;
3552 }
3553
3554 static char *exec_command_line(char **argv);
3555
3556 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3557         bool using_subcgroup;
3558         char *p;
3559
3560         assert(params);
3561         assert(ret);
3562
3563         if (!params->cgroup_path)
3564                 return -EINVAL;
3565
3566         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3567          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3568          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3569          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3570          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3571          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3572          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3573          * flag, which is only passed for the former statements, not for the latter. */
3574
3575         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3576         if (using_subcgroup)
3577                 p = path_join(params->cgroup_path, ".control");
3578         else
3579                 p = strdup(params->cgroup_path);
3580         if (!p)
3581                 return -ENOMEM;
3582
3583         *ret = p;
3584         return using_subcgroup;
3585 }
3586
3587 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3588         _cleanup_(cpu_set_reset) CPUSet s = {};
3589         int r;
3590
3591         assert(c);
3592         assert(ret);
3593
3594         if (!c->numa_policy.nodes.set) {
3595                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3596                 return 0;
3597         }
3598
3599         r = numa_to_cpu_set(&c->numa_policy, &s);
3600         if (r < 0)
3601                 return r;
3602
3603         cpu_set_reset(ret);
3604
3605         return cpu_set_add_all(ret, &s);
3606 }
3607
3608 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3609         assert(c);
3610
3611         return c->cpu_affinity_from_numa;
3612 }
3613
3614 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3615         int r;
3616
3617         assert(fds);
3618         assert(n_fds);
3619         assert(*n_fds < fds_size);
3620         assert(ret_fd);
3621
3622         if (fd < 0) {
3623                 *ret_fd = -1;
3624                 return 0;
3625         }
3626
3627         if (fd < 3 + (int) *n_fds) {
3628                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3629                  * the fds we pass to the process (or which are closed only during execve). */
3630
3631                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3632                 if (r < 0)
3633                         return -errno;
3634
3635                 CLOSE_AND_REPLACE(fd, r);
3636         }
3637
3638         *ret_fd = fds[*n_fds] = fd;
3639         (*n_fds) ++;
3640         return 1;
3641 }
3642
3643 static int exec_child(
3644                 Unit *unit,
3645                 const ExecCommand *command,
3646                 const ExecContext *context,
3647                 const ExecParameters *params,
3648                 ExecRuntime *runtime,
3649                 DynamicCreds *dcreds,
3650                 int socket_fd,
3651                 const int named_iofds[static 3],
3652                 int *fds,
3653                 size_t n_socket_fds,
3654                 size_t n_storage_fds,
3655                 char **files_env,
3656                 int user_lookup_fd,
3657                 int *exit_status) {
3658
3659         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3660         int r, ngids = 0, exec_fd;
3661         _cleanup_free_ gid_t *supplementary_gids = NULL;
3662         const char *username = NULL, *groupname = NULL;
3663         _cleanup_free_ char *home_buffer = NULL;
3664         const char *home = NULL, *shell = NULL;
3665         char **final_argv = NULL;
3666         dev_t journal_stream_dev = 0;
3667         ino_t journal_stream_ino = 0;
3668         bool userns_set_up = false;
3669         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3670                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3671                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3672                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3673 #if HAVE_SELINUX
3674         _cleanup_free_ char *mac_selinux_context_net = NULL;
3675         bool use_selinux = false;
3676 #endif
3677 #if ENABLE_SMACK
3678         bool use_smack = false;
3679 #endif
3680 #if HAVE_APPARMOR
3681         bool use_apparmor = false;
3682 #endif
3683         uid_t saved_uid = getuid();
3684         gid_t saved_gid = getgid();
3685         uid_t uid = UID_INVALID;
3686         gid_t gid = GID_INVALID;
3687         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3688                n_keep_fds; /* total number of fds not to close */
3689         int secure_bits;
3690         _cleanup_free_ gid_t *gids_after_pam = NULL;
3691         int ngids_after_pam = 0;
3692
3693         assert(unit);
3694         assert(command);
3695         assert(context);
3696         assert(params);
3697         assert(exit_status);
3698
3699         rename_process_from_path(command->path);
3700
3701         /* We reset exactly these signals, since they are the
3702          * only ones we set to SIG_IGN in the main daemon. All
3703          * others we leave untouched because we set them to
3704          * SIG_DFL or a valid handler initially, both of which
3705          * will be demoted to SIG_DFL. */
3706         (void) default_signals(SIGNALS_CRASH_HANDLER,
3707                                SIGNALS_IGNORE, -1);
3708
3709         if (context->ignore_sigpipe)
3710                 (void) ignore_signals(SIGPIPE, -1);
3711
3712         r = reset_signal_mask();
3713         if (r < 0) {
3714                 *exit_status = EXIT_SIGNAL_MASK;
3715                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3716         }
3717
3718         if (params->idle_pipe)
3719                 do_idle_pipe_dance(params->idle_pipe);
3720
3721         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3722          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3723          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3724          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3725
3726         log_forget_fds();
3727         log_set_open_when_needed(true);
3728
3729         /* In case anything used libc syslog(), close this here, too */
3730         closelog();
3731
3732         int keep_fds[n_fds + 2];
3733         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3734         n_keep_fds = n_fds;
3735
3736         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3737         if (r < 0) {
3738                 *exit_status = EXIT_FDS;
3739                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3740         }
3741
3742         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3743         if (r < 0) {
3744                 *exit_status = EXIT_FDS;
3745                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3746         }
3747
3748         if (!context->same_pgrp &&
3749             setsid() < 0) {
3750                 *exit_status = EXIT_SETSID;
3751                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3752         }
3753
3754         exec_context_tty_reset(context, params);
3755
3756         if (unit_shall_confirm_spawn(unit)) {
3757                 const char *vc = params->confirm_spawn;
3758                 _cleanup_free_ char *cmdline = NULL;
3759
3760                 cmdline = exec_command_line(command->argv);
3761                 if (!cmdline) {
3762                         *exit_status = EXIT_MEMORY;
3763                         return log_oom();
3764                 }
3765
3766                 r = ask_for_confirmation(vc, unit, cmdline);
3767                 if (r != CONFIRM_EXECUTE) {
3768                         if (r == CONFIRM_PRETEND_SUCCESS) {
3769                                 *exit_status = EXIT_SUCCESS;
3770                                 return 0;
3771                         }
3772                         *exit_status = EXIT_CONFIRM;
3773                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3774                                                     "Execution cancelled by the user");
3775                 }
3776         }
3777
3778         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3779          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3780          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3781          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3782          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3783         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3784             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3785                 *exit_status = EXIT_MEMORY;
3786                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3787         }
3788
3789         if (context->dynamic_user && dcreds) {
3790                 _cleanup_strv_free_ char **suggested_paths = NULL;
3791
3792                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3793                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3794                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3795                         *exit_status = EXIT_USER;
3796                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3797                 }
3798
3799                 r = compile_suggested_paths(context, params, &suggested_paths);
3800                 if (r < 0) {
3801                         *exit_status = EXIT_MEMORY;
3802                         return log_oom();
3803                 }
3804
3805                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3806                 if (r < 0) {
3807                         *exit_status = EXIT_USER;
3808                         if (r == -EILSEQ)
3809                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3810                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
3811                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3812                 }
3813
3814                 if (!uid_is_valid(uid)) {
3815                         *exit_status = EXIT_USER;
3816                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
3817                 }
3818
3819                 if (!gid_is_valid(gid)) {
3820                         *exit_status = EXIT_USER;
3821                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
3822                 }
3823
3824                 if (dcreds->user)
3825                         username = dcreds->user->name;
3826
3827         } else {
3828                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3829                 if (r < 0) {
3830                         *exit_status = EXIT_USER;
3831                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3832                 }
3833
3834                 r = get_fixed_group(context, &groupname, &gid);
3835                 if (r < 0) {
3836                         *exit_status = EXIT_GROUP;
3837                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3838                 }
3839         }
3840
3841         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3842         r = get_supplementary_groups(context, username, groupname, gid,
3843                                      &supplementary_gids, &ngids);
3844         if (r < 0) {
3845                 *exit_status = EXIT_GROUP;
3846                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3847         }
3848
3849         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3850         if (r < 0) {
3851                 *exit_status = EXIT_USER;
3852                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3853         }
3854
3855         user_lookup_fd = safe_close(user_lookup_fd);
3856
3857         r = acquire_home(context, uid, &home, &home_buffer);
3858         if (r < 0) {
3859                 *exit_status = EXIT_CHDIR;
3860                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3861         }
3862
        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must be sure to drop O_NONBLOCK */
3865         if (socket_fd >= 0)
3866                 (void) fd_nonblock(socket_fd, false);
3867
3868         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3869          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3870         if (params->cgroup_path) {
3871                 _cleanup_free_ char *p = NULL;
3872
3873                 r = exec_parameters_get_cgroup_path(params, &p);
3874                 if (r < 0) {
3875                         *exit_status = EXIT_CGROUP;
3876                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3877                 }
3878
3879                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3880                 if (r < 0) {
3881                         *exit_status = EXIT_CGROUP;
3882                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3883                 }
3884         }
3885
3886         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3887                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3888                 if (r < 0) {
3889                         *exit_status = EXIT_NETWORK;
3890                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3891                 }
3892         }
3893
3894         r = setup_input(context, params, socket_fd, named_iofds);
3895         if (r < 0) {
3896                 *exit_status = EXIT_STDIN;
3897                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3898         }
3899
3900         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3901         if (r < 0) {
3902                 *exit_status = EXIT_STDOUT;
3903                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3904         }
3905
3906         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3907         if (r < 0) {
3908                 *exit_status = EXIT_STDERR;
3909                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3910         }
3911
3912         if (context->oom_score_adjust_set) {
3913                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3914                  * prohibit write access to this file, and we shouldn't trip up over that. */
3915                 r = set_oom_score_adjust(context->oom_score_adjust);
3916                 if (ERRNO_IS_PRIVILEGE(r))
3917                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3918                 else if (r < 0) {
3919                         *exit_status = EXIT_OOM_ADJUST;
3920                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3921                 }
3922         }
3923
3924         if (context->coredump_filter_set) {
3925                 r = set_coredump_filter(context->coredump_filter);
3926                 if (ERRNO_IS_PRIVILEGE(r))
3927                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3928                 else if (r < 0)
3929                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3930         }
3931
3932         if (context->nice_set) {
3933                 r = setpriority_closest(context->nice);
3934                 if (r < 0)
3935                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3936         }
3937
3938         if (context->cpu_sched_set) {
3939                 struct sched_param param = {
3940                         .sched_priority = context->cpu_sched_priority,
3941                 };
3942
3943                 r = sched_setscheduler(0,
3944                                        context->cpu_sched_policy |
3945                                        (context->cpu_sched_reset_on_fork ?
3946                                         SCHED_RESET_ON_FORK : 0),
3947                                        &param);
3948                 if (r < 0) {
3949                         *exit_status = EXIT_SETSCHEDULER;
3950                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3951                 }
3952         }
3953
3954         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3955                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3956                 const CPUSet *cpu_set;
3957
3958                 if (context->cpu_affinity_from_numa) {
3959                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3960                         if (r < 0) {
3961                                 *exit_status = EXIT_CPUAFFINITY;
3962                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3963                         }
3964
3965                         cpu_set = &converted_cpu_set;
3966                 } else
3967                         cpu_set = &context->cpu_set;
3968
3969                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3970                         *exit_status = EXIT_CPUAFFINITY;
3971                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3972                 }
3973         }
3974
3975         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3976                 r = apply_numa_policy(&context->numa_policy);
3977                 if (r == -EOPNOTSUPP)
3978                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3979                 else if (r < 0) {
3980                         *exit_status = EXIT_NUMA_POLICY;
3981                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3982                 }
3983         }
3984
3985         if (context->ioprio_set)
3986                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3987                         *exit_status = EXIT_IOPRIO;
3988                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3989                 }
3990
3991         if (context->timer_slack_nsec != NSEC_INFINITY)
3992                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3993                         *exit_status = EXIT_TIMERSLACK;
3994                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3995                 }
3996
3997         if (context->personality != PERSONALITY_INVALID) {
3998                 r = safe_personality(context->personality);
3999                 if (r < 0) {
4000                         *exit_status = EXIT_PERSONALITY;
4001                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4002                 }
4003         }
4004
4005         if (context->utmp_id)
4006                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4007                                       context->tty_path,
4008                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4009                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4010                                       USER_PROCESS,
4011                                       username);
4012
4013         if (uid_is_valid(uid)) {
4014                 r = chown_terminal(STDIN_FILENO, uid);
4015                 if (r < 0) {
4016                         *exit_status = EXIT_STDIN;
4017                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4018                 }
4019         }
4020
4021         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4022          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4023          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4024          * touch a single hierarchy too. */
4025         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4026                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4027                 if (r < 0) {
4028                         *exit_status = EXIT_CGROUP;
4029                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4030                 }
4031         }
4032
4033         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4034                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
4035                 if (r < 0)
4036                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4037         }
4038
4039         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4040                 r = setup_credentials(context, params, unit->id, uid);
4041                 if (r < 0) {
4042                         *exit_status = EXIT_CREDENTIALS;
4043                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4044                 }
4045         }
4046
4047         r = build_environment(
4048                         unit,
4049                         context,
4050                         params,
4051                         n_fds,
4052                         home,
4053                         username,
4054                         shell,
4055                         journal_stream_dev,
4056                         journal_stream_ino,
4057                         &our_env);
4058         if (r < 0) {
4059                 *exit_status = EXIT_MEMORY;
4060                 return log_oom();
4061         }
4062
4063         r = build_pass_environment(context, &pass_env);
4064         if (r < 0) {
4065                 *exit_status = EXIT_MEMORY;
4066                 return log_oom();
4067         }
4068
4069         accum_env = strv_env_merge(5,
4070                                    params->environment,
4071                                    our_env,
4072                                    pass_env,
4073                                    context->environment,
4074                                    files_env);
4075         if (!accum_env) {
4076                 *exit_status = EXIT_MEMORY;
4077                 return log_oom();
4078         }
4079         accum_env = strv_env_clean(accum_env);
4080
4081         (void) umask(context->umask);
4082
4083         r = setup_keyring(unit, context, params, uid, gid);
4084         if (r < 0) {
4085                 *exit_status = EXIT_KEYRING;
4086                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4087         }
4088
4089         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4090         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4091
4092         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4093         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4094
4095         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4096         if (needs_ambient_hack)
4097                 needs_setuid = false;
4098         else
4099                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4100
4101         if (needs_sandboxing) {
4102                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4103                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4104                  * impacting our own code paths. */
4105
4106 #if HAVE_SELINUX
4107                 use_selinux = mac_selinux_use();
4108 #endif
4109 #if ENABLE_SMACK
4110                 use_smack = mac_smack_use();
4111 #endif
4112 #if HAVE_APPARMOR
4113                 use_apparmor = mac_apparmor_use();
4114 #endif
4115         }
4116
4117         if (needs_sandboxing) {
4118                 int which_failed;
4119
4120                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4121                  * is set here. (See below.) */
4122
4123                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4124                 if (r < 0) {
4125                         *exit_status = EXIT_LIMITS;
4126                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4127                 }
4128         }
4129
4130         if (needs_setuid && context->pam_name && username) {
4131                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4132                  * wins here. (See above.) */
4133
4134                 /* All fds passed in the fds array will be closed in the pam child process. */
4135                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4136                 if (r < 0) {
4137                         *exit_status = EXIT_PAM;
4138                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4139                 }
4140
4141                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4142                 if (ngids_after_pam < 0) {
4143                         *exit_status = EXIT_MEMORY;
4144                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4145                 }
4146         }
4147
4148         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4149                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4150                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4151                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4152
4153                 userns_set_up = true;
4154                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4155                 if (r < 0) {
4156                         *exit_status = EXIT_USER;
4157                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4158                 }
4159         }
4160
4161         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4162
4163                 if (ns_type_supported(NAMESPACE_NET)) {
4164                         r = setup_netns(runtime->netns_storage_socket);
4165                         if (r == -EPERM)
4166                                 log_unit_warning_errno(unit, r,
4167                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4168                         else if (r < 0) {
4169                                 *exit_status = EXIT_NETWORK;
4170                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4171                         }
4172                 } else if (context->network_namespace_path) {
4173                         *exit_status = EXIT_NETWORK;
4174                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4175                                                     "NetworkNamespacePath= is not supported, refusing.");
4176                 } else
4177                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4178         }
4179
4180         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4181         if (needs_mount_namespace) {
4182                 _cleanup_free_ char *error_path = NULL;
4183
4184                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4185                 if (r < 0) {
4186                         *exit_status = EXIT_NAMESPACE;
4187                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4188                                                     error_path ? ": " : "", strempty(error_path));
4189                 }
4190         }
4191
4192         if (needs_sandboxing) {
4193                 r = apply_protect_hostname(unit, context, exit_status);
4194                 if (r < 0)
4195                         return r;
4196         }
4197
4198         /* Drop groups as early as possible.
4199          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4200          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4201         if (needs_setuid) {
4202                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4203                 int ngids_to_enforce = 0;
4204
4205                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4206                                                    ngids,
4207                                                    gids_after_pam,
4208                                                    ngids_after_pam,
4209                                                    &gids_to_enforce);
4210                 if (ngids_to_enforce < 0) {
4211                         *exit_status = EXIT_MEMORY;
4212                         return log_unit_error_errno(unit,
4213                                                     ngids_to_enforce,
4214                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4215                 }
4216
4217                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4218                 if (r < 0) {
4219                         *exit_status = EXIT_GROUP;
4220                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4221                 }
4222         }
4223
4224         /* If the user namespace was not set up above, try to do it now.
4225          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4226          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4227          * case of mount namespaces being less privileged when the mount point list is copied from a
4228          * different user namespace). */
4229
4230         if (needs_sandboxing && context->private_users && !userns_set_up) {
4231                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4232                 if (r < 0) {
4233                         *exit_status = EXIT_USER;
4234                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4235                 }
4236         }
4237
4238         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4239          * shall execute. */
4240
4241         _cleanup_free_ char *executable = NULL;
4242         _cleanup_close_ int executable_fd = -1;
4243         r = find_executable_full(command->path, false, &executable, &executable_fd);
4244         if (r < 0) {
4245                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4246                         log_struct_errno(LOG_INFO, r,
4247                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4248                                          LOG_UNIT_ID(unit),
4249                                          LOG_UNIT_INVOCATION_ID(unit),
4250                                          LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4251                                                           command->path),
4252                                          "EXECUTABLE=%s", command->path);
4253                         return 0;
4254                 }
4255
4256                 *exit_status = EXIT_EXEC;
4257                 return log_struct_errno(LOG_INFO, r,
4258                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4259                                         LOG_UNIT_ID(unit),
4260                                         LOG_UNIT_INVOCATION_ID(unit),
4261                                         LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4262                                                          command->path),
4263                                         "EXECUTABLE=%s", command->path);
4264         }
4265
4266         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4267         if (r < 0) {
4268                 *exit_status = EXIT_FDS;
4269                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4270         }
4271
4272 #if HAVE_SELINUX
4273         if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4274                 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4275                 if (r < 0) {
4276                         *exit_status = EXIT_SELINUX_CONTEXT;
4277                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4278                 }
4279         }
4280 #endif
4281
4282         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4283          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
4284          * however if we have it as we want to keep it open until the final execve(). */
4285
4286         r = close_all_fds(keep_fds, n_keep_fds);
4287         if (r >= 0)
4288                 r = shift_fds(fds, n_fds);
4289         if (r >= 0)
4290                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4291         if (r < 0) {
4292                 *exit_status = EXIT_FDS;
4293                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4294         }
4295
4296         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4297          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4298          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4299          * came this far. */
4300
4301         secure_bits = context->secure_bits;
4302
4303         if (needs_sandboxing) {
4304                 uint64_t bset;
4305
4306                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4307                  * requested. (Note this is placed after the general resource limit initialization, see
4308                  * above, in order to take precedence.) */
4309                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4310                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4311                                 *exit_status = EXIT_LIMITS;
4312                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4313                         }
4314                 }
4315
4316 #if ENABLE_SMACK
4317                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4318                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4319                 if (use_smack) {
4320                         r = setup_smack(context, executable_fd);
4321                         if (r < 0) {
4322                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4323                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4324                         }
4325                 }
4326 #endif
4327
4328                 bset = context->capability_bounding_set;
4329                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4330                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4331                  * instead of us doing that */
4332                 if (needs_ambient_hack)
4333                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4334                                 (UINT64_C(1) << CAP_SETUID) |
4335                                 (UINT64_C(1) << CAP_SETGID);
4336
4337                 if (!cap_test_all(bset)) {
4338                         r = capability_bounding_set_drop(bset, false);
4339                         if (r < 0) {
4340                                 *exit_status = EXIT_CAPABILITIES;
4341                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4342                         }
4343                 }
4344
4345                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4346                  * keep-caps set.
4347                  * To be able to raise the ambient capabilities after setresuid() they have to be
4348                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4349                  * After setresuid() the ambient capabilities can be raised as they are present in
4350                  * the permitted and inheritable set. However it is possible that someone wants to
4351                  * set ambient capabilities without changing the user, so we also set the ambient
4352                  * capabilities here.
4353                  * The requested ambient capabilities are raised in the inheritable set if the
4354                  * second argument is true. */
4355                 if (!needs_ambient_hack) {
4356                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4357                         if (r < 0) {
4358                                 *exit_status = EXIT_CAPABILITIES;
4359                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4360                         }
4361                 }
4362         }
4363
4364         /* chroot to root directory first, before we lose the ability to chroot */
4365         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4366         if (r < 0)
4367                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4368
4369         if (needs_setuid) {
4370                 if (uid_is_valid(uid)) {
4371                         r = enforce_user(context, uid);
4372                         if (r < 0) {
4373                                 *exit_status = EXIT_USER;
4374                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4375                         }
4376
4377                         if (!needs_ambient_hack &&
4378                             context->capability_ambient_set != 0) {
4379
4380                                 /* Raise the ambient capabilities after user change. */
4381                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4382                                 if (r < 0) {
4383                                         *exit_status = EXIT_CAPABILITIES;
4384                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4385                                 }
4386                         }
4387                 }
4388         }
4389
4390         /* Apply working directory here, because the working directory might be on NFS and only the user running
4391          * this service might have the correct privilege to change to the working directory */
4392         r = apply_working_directory(context, params, home, exit_status);
4393         if (r < 0)
4394                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4395
4396         if (needs_sandboxing) {
4397                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4398                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4399                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4400                  * are restricted. */
4401
4402 #if HAVE_SELINUX
4403                 if (use_selinux) {
4404                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4405
4406                         if (exec_context) {
4407                                 r = setexeccon(exec_context);
4408                                 if (r < 0) {
4409                                         *exit_status = EXIT_SELINUX_CONTEXT;
4410                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4411                                 }
4412                         }
4413                 }
4414 #endif
4415
4416 #if HAVE_APPARMOR
4417                 if (use_apparmor && context->apparmor_profile) {
4418                         r = aa_change_onexec(context->apparmor_profile);
4419                         if (r < 0 && !context->apparmor_profile_ignore) {
4420                                 *exit_status = EXIT_APPARMOR_PROFILE;
4421                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4422                         }
4423                 }
4424 #endif
4425
4426                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4427                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4428                  * CAP_SETPCAP. */
4429                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4430                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4431                          * effective set here.
4432                          * The effective set is overwritten during execve  with the following  values:
4433                          * - ambient set (for non-root processes)
4434                          * - (inheritable | bounding) set (for root processes)
4435                          *
4436                          * Hence there is no security impact to raise it in the effective set before execve
4437                          */
4438                         r = capability_gain_cap_setpcap(NULL);
4439                         if (r < 0) {
4440                                 *exit_status = EXIT_CAPABILITIES;
4441                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4442                         }
4443                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4444                                 *exit_status = EXIT_SECUREBITS;
4445                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4446                         }
4447                 }
4448
4449                 if (context_has_no_new_privileges(context))
4450                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4451                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4452                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4453                         }
4454
4455 #if HAVE_SECCOMP
4456                 r = apply_address_families(unit, context);
4457                 if (r < 0) {
4458                         *exit_status = EXIT_ADDRESS_FAMILIES;
4459                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4460                 }
4461
4462                 r = apply_memory_deny_write_execute(unit, context);
4463                 if (r < 0) {
4464                         *exit_status = EXIT_SECCOMP;
4465                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4466                 }
4467
4468                 r = apply_restrict_realtime(unit, context);
4469                 if (r < 0) {
4470                         *exit_status = EXIT_SECCOMP;
4471                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4472                 }
4473
4474                 r = apply_restrict_suid_sgid(unit, context);
4475                 if (r < 0) {
4476                         *exit_status = EXIT_SECCOMP;
4477                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4478                 }
4479
4480                 r = apply_restrict_namespaces(unit, context);
4481                 if (r < 0) {
4482                         *exit_status = EXIT_SECCOMP;
4483                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4484                 }
4485
4486                 r = apply_protect_sysctl(unit, context);
4487                 if (r < 0) {
4488                         *exit_status = EXIT_SECCOMP;
4489                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4490                 }
4491
4492                 r = apply_protect_kernel_modules(unit, context);
4493                 if (r < 0) {
4494                         *exit_status = EXIT_SECCOMP;
4495                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4496                 }
4497
4498                 r = apply_protect_kernel_logs(unit, context);
4499                 if (r < 0) {
4500                         *exit_status = EXIT_SECCOMP;
4501                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4502                 }
4503
4504                 r = apply_protect_clock(unit, context);
4505                 if (r < 0) {
4506                         *exit_status = EXIT_SECCOMP;
4507                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4508                 }
4509
4510                 r = apply_private_devices(unit, context);
4511                 if (r < 0) {
4512                         *exit_status = EXIT_SECCOMP;
4513                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4514                 }
4515
4516                 r = apply_syscall_archs(unit, context);
4517                 if (r < 0) {
4518                         *exit_status = EXIT_SECCOMP;
4519                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4520                 }
4521
4522                 r = apply_lock_personality(unit, context);
4523                 if (r < 0) {
4524                         *exit_status = EXIT_SECCOMP;
4525                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4526                 }
4527
4528                 r = apply_syscall_log(unit, context);
4529                 if (r < 0) {
4530                         *exit_status = EXIT_SECCOMP;
4531                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4532                 }
4533
4534                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4535                  * by the filter as little as possible. */
4536                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4537                 if (r < 0) {
4538                         *exit_status = EXIT_SECCOMP;
4539                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4540                 }
4541 #endif
4542         }
4543
4544         if (!strv_isempty(context->unset_environment)) {
4545                 char **ee = NULL;
4546
4547                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4548                 if (!ee) {
4549                         *exit_status = EXIT_MEMORY;
4550                         return log_oom();
4551                 }
4552
4553                 strv_free_and_replace(accum_env, ee);
4554         }
4555
4556         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4557                 replaced_argv = replace_env_argv(command->argv, accum_env);
4558                 if (!replaced_argv) {
4559                         *exit_status = EXIT_MEMORY;
4560                         return log_oom();
4561                 }
4562                 final_argv = replaced_argv;
4563         } else
4564                 final_argv = command->argv;
4565
4566         if (DEBUG_LOGGING) {
4567                 _cleanup_free_ char *line;
4568
4569                 line = exec_command_line(final_argv);
4570                 if (line)
4571                         log_struct(LOG_DEBUG,
4572                                    "EXECUTABLE=%s", executable,
4573                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4574                                    LOG_UNIT_ID(unit),
4575                                    LOG_UNIT_INVOCATION_ID(unit));
4576         }
4577
4578         if (exec_fd >= 0) {
4579                 uint8_t hot = 1;
4580
4581                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4582                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4583
4584                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4585                         *exit_status = EXIT_EXEC;
4586                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4587                 }
4588         }
4589
4590         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4591
4592         if (exec_fd >= 0) {
4593                 uint8_t hot = 0;
4594
4595                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4596                  * that POLLHUP on it no longer means execve() succeeded. */
4597
4598                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4599                         *exit_status = EXIT_EXEC;
4600                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4601                 }
4602         }
4603
4604         *exit_status = EXIT_EXEC;
4605         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4606 }
4607
4608 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4609 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4610
4611 int exec_spawn(Unit *unit,
4612                ExecCommand *command,
4613                const ExecContext *context,
4614                const ExecParameters *params,
4615                ExecRuntime *runtime,
4616                DynamicCreds *dcreds,
4617                pid_t *ret) {
4618
4619         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4620         _cleanup_free_ char *subcgroup_path = NULL;
4621         _cleanup_strv_free_ char **files_env = NULL;
4622         size_t n_storage_fds = 0, n_socket_fds = 0;
4623         _cleanup_free_ char *line = NULL;
4624         pid_t pid;
4625
4626         assert(unit);
4627         assert(command);
4628         assert(context);
4629         assert(ret);
4630         assert(params);
4631         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4632
4633         if (context->std_input == EXEC_INPUT_SOCKET ||
4634             context->std_output == EXEC_OUTPUT_SOCKET ||
4635             context->std_error == EXEC_OUTPUT_SOCKET) {
4636
4637                 if (params->n_socket_fds > 1)
4638                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4639
4640                 if (params->n_socket_fds == 0)
4641                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4642
4643                 socket_fd = params->fds[0];
4644         } else {
4645                 socket_fd = -1;
4646                 fds = params->fds;
4647                 n_socket_fds = params->n_socket_fds;
4648                 n_storage_fds = params->n_storage_fds;
4649         }
4650
4651         r = exec_context_named_iofds(context, params, named_iofds);
4652         if (r < 0)
4653                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4654
4655         r = exec_context_load_environment(unit, context, &files_env);
4656         if (r < 0)
4657                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4658
4659         line = exec_command_line(command->argv);
4660         if (!line)
4661                 return log_oom();
4662
4663         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4664            and, until the next SELinux policy changes, we save further reloads in future children. */
4665         mac_selinux_maybe_reload();
4666
4667         log_struct(LOG_DEBUG,
4668                    LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4669                    "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4670                                                       the mount namespace in the child, but we want to log
4671                                                       from the parent, so we need to use the (possibly
4672                                                       inaccurate) path here. */
4673                    LOG_UNIT_ID(unit),
4674                    LOG_UNIT_INVOCATION_ID(unit));
4675
4676         if (params->cgroup_path) {
4677                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4678                 if (r < 0)
4679                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4680                 if (r > 0) { /* We are using a child cgroup */
4681                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4682                         if (r < 0)
4683                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4684                 }
4685         }
4686
4687         pid = fork();
4688         if (pid < 0)
4689                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4690
4691         if (pid == 0) {
4692                 int exit_status = EXIT_SUCCESS;
4693
4694                 r = exec_child(unit,
4695                                command,
4696                                context,
4697                                params,
4698                                runtime,
4699                                dcreds,
4700                                socket_fd,
4701                                named_iofds,
4702                                fds,
4703                                n_socket_fds,
4704                                n_storage_fds,
4705                                files_env,
4706                                unit->manager->user_lookup_fds[1],
4707                                &exit_status);
4708
4709                 if (r < 0) {
4710                         const char *status =
4711                                 exit_status_to_string(exit_status,
4712                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4713
4714                         log_struct_errno(LOG_ERR, r,
4715                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4716                                          LOG_UNIT_ID(unit),
4717                                          LOG_UNIT_INVOCATION_ID(unit),
4718                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4719                                                           status, command->path),
4720                                          "EXECUTABLE=%s", command->path);
4721                 }
4722
4723                 _exit(exit_status);
4724         }
4725
4726         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4727
4728         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4729          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4730          * process will be killed too). */
4731         if (subcgroup_path)
4732                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4733
4734         exec_status_start(&command->exec_status, pid);
4735
4736         *ret = pid;
4737         return 0;
4738 }
4739
4740 void exec_context_init(ExecContext *c) {
4741         assert(c);
4742
4743         c->umask = 0022;
4744         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4745         c->cpu_sched_policy = SCHED_OTHER;
4746         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4747         c->syslog_level_prefix = true;
4748         c->ignore_sigpipe = true;
4749         c->timer_slack_nsec = NSEC_INFINITY;
4750         c->personality = PERSONALITY_INVALID;
4751         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4752                 c->directories[t].mode = 0755;
4753         c->timeout_clean_usec = USEC_INFINITY;
4754         c->capability_bounding_set = CAP_ALL;
4755         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4756         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4757         c->log_level_max = -1;
4758 #if HAVE_SECCOMP
4759         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4760 #endif
4761         numa_policy_reset(&c->numa_policy);
4762 }
4763
4764 void exec_context_done(ExecContext *c) {
4765         assert(c);
4766
4767         c->environment = strv_free(c->environment);
4768         c->environment_files = strv_free(c->environment_files);
4769         c->pass_environment = strv_free(c->pass_environment);
4770         c->unset_environment = strv_free(c->unset_environment);
4771
4772         rlimit_free_all(c->rlimit);
4773
4774         for (size_t l = 0; l < 3; l++) {
4775                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4776                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4777         }
4778
4779         c->working_directory = mfree(c->working_directory);
4780         c->root_directory = mfree(c->root_directory);
4781         c->root_image = mfree(c->root_image);
4782         c->root_image_options = mount_options_free_all(c->root_image_options);
4783         c->root_hash = mfree(c->root_hash);
4784         c->root_hash_size = 0;
4785         c->root_hash_path = mfree(c->root_hash_path);
4786         c->root_hash_sig = mfree(c->root_hash_sig);
4787         c->root_hash_sig_size = 0;
4788         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4789         c->root_verity = mfree(c->root_verity);
4790         c->tty_path = mfree(c->tty_path);
4791         c->syslog_identifier = mfree(c->syslog_identifier);
4792         c->user = mfree(c->user);
4793         c->group = mfree(c->group);
4794
4795         c->supplementary_groups = strv_free(c->supplementary_groups);
4796
4797         c->pam_name = mfree(c->pam_name);
4798
4799         c->read_only_paths = strv_free(c->read_only_paths);
4800         c->read_write_paths = strv_free(c->read_write_paths);
4801         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4802
4803         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4804         c->bind_mounts = NULL;
4805         c->n_bind_mounts = 0;
4806         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4807         c->temporary_filesystems = NULL;
4808         c->n_temporary_filesystems = 0;
4809         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
4810
4811         cpu_set_reset(&c->cpu_set);
4812         numa_policy_reset(&c->numa_policy);
4813
4814         c->utmp_id = mfree(c->utmp_id);
4815         c->selinux_context = mfree(c->selinux_context);
4816         c->apparmor_profile = mfree(c->apparmor_profile);
4817         c->smack_process_label = mfree(c->smack_process_label);
4818
4819         c->syscall_filter = hashmap_free(c->syscall_filter);
4820         c->syscall_archs = set_free(c->syscall_archs);
4821         c->address_families = set_free(c->address_families);
4822
4823         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4824                 c->directories[t].paths = strv_free(c->directories[t].paths);
4825
4826         c->log_level_max = -1;
4827
4828         exec_context_free_log_extra_fields(c);
4829
4830         c->log_ratelimit_interval_usec = 0;
4831         c->log_ratelimit_burst = 0;
4832
4833         c->stdin_data = mfree(c->stdin_data);
4834         c->stdin_data_size = 0;
4835
4836         c->network_namespace_path = mfree(c->network_namespace_path);
4837
4838         c->log_namespace = mfree(c->log_namespace);
4839
4840         c->load_credentials = strv_free(c->load_credentials);
4841         c->set_credentials = hashmap_free(c->set_credentials);
4842 }
4843
4844 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4845         char **i;
4846
4847         assert(c);
4848
4849         if (!runtime_prefix)
4850                 return 0;
4851
4852         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4853                 _cleanup_free_ char *p;
4854
4855                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4856                         p = path_join(runtime_prefix, "private", *i);
4857                 else
4858                         p = path_join(runtime_prefix, *i);
4859                 if (!p)
4860                         return -ENOMEM;
4861
4862                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4863                  * service next. */
4864                 (void) rm_rf(p, REMOVE_ROOT);
4865         }
4866
4867         return 0;
4868 }
4869
4870 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4871         _cleanup_free_ char *p = NULL;
4872
4873         assert(c);
4874
4875         if (!runtime_prefix || !unit)
4876                 return 0;
4877
4878         p = path_join(runtime_prefix, "credentials", unit);
4879         if (!p)
4880                 return -ENOMEM;
4881
4882         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4883          * unmount it, and afterwards remove the mount point */
4884         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4885         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4886
4887         return 0;
4888 }
4889
4890 static void exec_command_done(ExecCommand *c) {
4891         assert(c);
4892
4893         c->path = mfree(c->path);
4894         c->argv = strv_free(c->argv);
4895 }
4896
4897 void exec_command_done_array(ExecCommand *c, size_t n) {
4898         size_t i;
4899
4900         for (i = 0; i < n; i++)
4901                 exec_command_done(c+i);
4902 }
4903
4904 ExecCommand* exec_command_free_list(ExecCommand *c) {
4905         ExecCommand *i;
4906
4907         while ((i = c)) {
4908                 LIST_REMOVE(command, c, i);
4909                 exec_command_done(i);
4910                 free(i);
4911         }
4912
4913         return NULL;
4914 }
4915
4916 void exec_command_free_array(ExecCommand **c, size_t n) {
4917         for (size_t i = 0; i < n; i++)
4918                 c[i] = exec_command_free_list(c[i]);
4919 }
4920
4921 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4922         for (size_t i = 0; i < n; i++)
4923                 exec_status_reset(&c[i].exec_status);
4924 }
4925
4926 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4927         for (size_t i = 0; i < n; i++) {
4928                 ExecCommand *z;
4929
4930                 LIST_FOREACH(command, z, c[i])
4931                         exec_status_reset(&z->exec_status);
4932         }
4933 }
4934
4935 typedef struct InvalidEnvInfo {
4936         const Unit *unit;
4937         const char *path;
4938 } InvalidEnvInfo;
4939
4940 static void invalid_env(const char *p, void *userdata) {
4941         InvalidEnvInfo *info = userdata;
4942
4943         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4944 }
4945
4946 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4947         assert(c);
4948
4949         switch (fd_index) {
4950
4951         case STDIN_FILENO:
4952                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4953                         return NULL;
4954
4955                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4956
4957         case STDOUT_FILENO:
4958                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4959                         return NULL;
4960
4961                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4962
4963         case STDERR_FILENO:
4964                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4965                         return NULL;
4966
4967                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4968
4969         default:
4970                 return NULL;
4971         }
4972 }
4973
4974 static int exec_context_named_iofds(
4975                 const ExecContext *c,
4976                 const ExecParameters *p,
4977                 int named_iofds[static 3]) {
4978
4979         size_t targets;
4980         const char* stdio_fdname[3];
4981         size_t n_fds;
4982
4983         assert(c);
4984         assert(p);
4985         assert(named_iofds);
4986
4987         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4988                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4989                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4990
4991         for (size_t i = 0; i < 3; i++)
4992                 stdio_fdname[i] = exec_context_fdname(c, i);
4993
4994         n_fds = p->n_storage_fds + p->n_socket_fds;
4995
4996         for (size_t i = 0; i < n_fds  && targets > 0; i++)
4997                 if (named_iofds[STDIN_FILENO] < 0 &&
4998                     c->std_input == EXEC_INPUT_NAMED_FD &&
4999                     stdio_fdname[STDIN_FILENO] &&
5000                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5001
5002                         named_iofds[STDIN_FILENO] = p->fds[i];
5003                         targets--;
5004
5005                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5006                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5007                            stdio_fdname[STDOUT_FILENO] &&
5008                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5009
5010                         named_iofds[STDOUT_FILENO] = p->fds[i];
5011                         targets--;
5012
5013                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5014                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5015                            stdio_fdname[STDERR_FILENO] &&
5016                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5017
5018                         named_iofds[STDERR_FILENO] = p->fds[i];
5019                         targets--;
5020                 }
5021
5022         return targets == 0 ? 0 : -ENOENT;
5023 }
5024
5025 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5026         char **i, **r = NULL;
5027
5028         assert(c);
5029         assert(l);
5030
5031         STRV_FOREACH(i, c->environment_files) {
5032                 char *fn;
5033                 int k;
5034                 bool ignore = false;
5035                 char **p;
5036                 _cleanup_globfree_ glob_t pglob = {};
5037
5038                 fn = *i;
5039
5040                 if (fn[0] == '-') {
5041                         ignore = true;
5042                         fn++;
5043                 }
5044
5045                 if (!path_is_absolute(fn)) {
5046                         if (ignore)
5047                                 continue;
5048
5049                         strv_free(r);
5050                         return -EINVAL;
5051                 }
5052
5053                 /* Filename supports globbing, take all matching files */
5054                 k = safe_glob(fn, 0, &pglob);
5055                 if (k < 0) {
5056                         if (ignore)
5057                                 continue;
5058
5059                         strv_free(r);
5060                         return k;
5061                 }
5062
5063                 /* When we don't match anything, -ENOENT should be returned */
5064                 assert(pglob.gl_pathc > 0);
5065
5066                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5067                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5068                         if (k < 0) {
5069                                 if (ignore)
5070                                         continue;
5071
5072                                 strv_free(r);
5073                                 return k;
5074                         }
5075                         /* Log invalid environment variables with filename */
5076                         if (p) {
5077                                 InvalidEnvInfo info = {
5078                                         .unit = unit,
5079                                         .path = pglob.gl_pathv[n]
5080                                 };
5081
5082                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5083                         }
5084
5085                         if (!r)
5086                                 r = p;
5087                         else {
5088                                 char **m;
5089
5090                                 m = strv_env_merge(2, r, p);
5091                                 strv_free(r);
5092                                 strv_free(p);
5093                                 if (!m)
5094                                         return -ENOMEM;
5095
5096                                 r = m;
5097                         }
5098                 }
5099         }
5100
5101         *l = r;
5102
5103         return 0;
5104 }
5105
5106 static bool tty_may_match_dev_console(const char *tty) {
5107         _cleanup_free_ char *resolved = NULL;
5108
5109         if (!tty)
5110                 return true;
5111
5112         tty = skip_dev_prefix(tty);
5113
5114         /* trivial identity? */
5115         if (streq(tty, "console"))
5116                 return true;
5117
5118         if (resolve_dev_console(&resolved) < 0)
5119                 return true; /* if we could not resolve, assume it may */
5120
5121         /* "tty0" means the active VC, so it may be the same sometimes */
5122         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5123 }
5124
5125 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5126         assert(ec);
5127
5128         return ec->tty_reset ||
5129                 ec->tty_vhangup ||
5130                 ec->tty_vt_disallocate ||
5131                 is_terminal_input(ec->std_input) ||
5132                 is_terminal_output(ec->std_output) ||
5133                 is_terminal_output(ec->std_error);
5134 }
5135
5136 bool exec_context_may_touch_console(const ExecContext *ec) {
5137
5138         return exec_context_may_touch_tty(ec) &&
5139                tty_may_match_dev_console(exec_context_tty_path(ec));
5140 }
5141
5142 static void strv_fprintf(FILE *f, char **l) {
5143         char **g;
5144
5145         assert(f);
5146
5147         STRV_FOREACH(g, l)
5148                 fprintf(f, " %s", *g);
5149 }
5150
5151 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5152         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
5153         int r;
5154
5155         assert(c);
5156         assert(f);
5157
5158         prefix = strempty(prefix);
5159
5160         fprintf(f,
5161                 "%sUMask: %04o\n"
5162                 "%sWorkingDirectory: %s\n"
5163                 "%sRootDirectory: %s\n"
5164                 "%sNonBlocking: %s\n"
5165                 "%sPrivateTmp: %s\n"
5166                 "%sPrivateDevices: %s\n"
5167                 "%sProtectKernelTunables: %s\n"
5168                 "%sProtectKernelModules: %s\n"
5169                 "%sProtectKernelLogs: %s\n"
5170                 "%sProtectClock: %s\n"
5171                 "%sProtectControlGroups: %s\n"
5172                 "%sPrivateNetwork: %s\n"
5173                 "%sPrivateUsers: %s\n"
5174                 "%sProtectHome: %s\n"
5175                 "%sProtectSystem: %s\n"
5176                 "%sMountAPIVFS: %s\n"
5177                 "%sIgnoreSIGPIPE: %s\n"
5178                 "%sMemoryDenyWriteExecute: %s\n"
5179                 "%sRestrictRealtime: %s\n"
5180                 "%sRestrictSUIDSGID: %s\n"
5181                 "%sKeyringMode: %s\n"
5182                 "%sProtectHostname: %s\n"
5183                 "%sProtectProc: %s\n"
5184                 "%sProcSubset: %s\n",
5185                 prefix, c->umask,
5186                 prefix, empty_to_root(c->working_directory),
5187                 prefix, empty_to_root(c->root_directory),
5188                 prefix, yes_no(c->non_blocking),
5189                 prefix, yes_no(c->private_tmp),
5190                 prefix, yes_no(c->private_devices),
5191                 prefix, yes_no(c->protect_kernel_tunables),
5192                 prefix, yes_no(c->protect_kernel_modules),
5193                 prefix, yes_no(c->protect_kernel_logs),
5194                 prefix, yes_no(c->protect_clock),
5195                 prefix, yes_no(c->protect_control_groups),
5196                 prefix, yes_no(c->private_network),
5197                 prefix, yes_no(c->private_users),
5198                 prefix, protect_home_to_string(c->protect_home),
5199                 prefix, protect_system_to_string(c->protect_system),
5200                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5201                 prefix, yes_no(c->ignore_sigpipe),
5202                 prefix, yes_no(c->memory_deny_write_execute),
5203                 prefix, yes_no(c->restrict_realtime),
5204                 prefix, yes_no(c->restrict_suid_sgid),
5205                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5206                 prefix, yes_no(c->protect_hostname),
5207                 prefix, protect_proc_to_string(c->protect_proc),
5208                 prefix, proc_subset_to_string(c->proc_subset));
5209
5210         if (c->root_image)
5211                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5212
5213         if (c->root_image_options) {
5214                 MountOptions *o;
5215
5216                 fprintf(f, "%sRootImageOptions:", prefix);
5217                 LIST_FOREACH(mount_options, o, c->root_image_options)
5218                         if (!isempty(o->options))
5219                                 fprintf(f, " %s:%s",
5220                                         partition_designator_to_string(o->partition_designator),
5221                                         o->options);
5222                 fprintf(f, "\n");
5223         }
5224
5225         if (c->root_hash) {
5226                 _cleanup_free_ char *encoded = NULL;
5227                 encoded = hexmem(c->root_hash, c->root_hash_size);
5228                 if (encoded)
5229                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5230         }
5231
5232         if (c->root_hash_path)
5233                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5234
5235         if (c->root_hash_sig) {
5236                 _cleanup_free_ char *encoded = NULL;
5237                 ssize_t len;
5238                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5239                 if (len)
5240                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5241         }
5242
5243         if (c->root_hash_sig_path)
5244                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5245
5246         if (c->root_verity)
5247                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5248
5249         STRV_FOREACH(e, c->environment)
5250                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5251
5252         STRV_FOREACH(e, c->environment_files)
5253                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5254
5255         STRV_FOREACH(e, c->pass_environment)
5256                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5257
5258         STRV_FOREACH(e, c->unset_environment)
5259                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5260
5261         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5262
5263         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5264                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5265
5266                 STRV_FOREACH(d, c->directories[dt].paths)
5267                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5268         }
5269
5270         fprintf(f,
5271                 "%sTimeoutCleanSec: %s\n",
5272                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5273
5274         if (c->nice_set)
5275                 fprintf(f,
5276                         "%sNice: %i\n",
5277                         prefix, c->nice);
5278
5279         if (c->oom_score_adjust_set)
5280                 fprintf(f,
5281                         "%sOOMScoreAdjust: %i\n",
5282                         prefix, c->oom_score_adjust);
5283
5284         if (c->coredump_filter_set)
5285                 fprintf(f,
5286                         "%sCoredumpFilter: 0x%"PRIx64"\n",
5287                         prefix, c->coredump_filter);
5288
5289         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5290                 if (c->rlimit[i]) {
5291                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5292                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5293                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5294                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5295                 }
5296
5297         if (c->ioprio_set) {
5298                 _cleanup_free_ char *class_str = NULL;
5299
5300                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5301                 if (r >= 0)
5302                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5303
5304                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
5305         }
5306
5307         if (c->cpu_sched_set) {
5308                 _cleanup_free_ char *policy_str = NULL;
5309
5310                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5311                 if (r >= 0)
5312                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5313
5314                 fprintf(f,
5315                         "%sCPUSchedulingPriority: %i\n"
5316                         "%sCPUSchedulingResetOnFork: %s\n",
5317                         prefix, c->cpu_sched_priority,
5318                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5319         }
5320
5321         if (c->cpu_set.set) {
5322                 _cleanup_free_ char *affinity = NULL;
5323
5324                 affinity = cpu_set_to_range_string(&c->cpu_set);
5325                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5326         }
5327
5328         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5329                 _cleanup_free_ char *nodes = NULL;
5330
5331                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5332                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5333                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5334         }
5335
5336         if (c->timer_slack_nsec != NSEC_INFINITY)
5337                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5338
5339         fprintf(f,
5340                 "%sStandardInput: %s\n"
5341                 "%sStandardOutput: %s\n"
5342                 "%sStandardError: %s\n",
5343                 prefix, exec_input_to_string(c->std_input),
5344                 prefix, exec_output_to_string(c->std_output),
5345                 prefix, exec_output_to_string(c->std_error));
5346
5347         if (c->std_input == EXEC_INPUT_NAMED_FD)
5348                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5349         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5350                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5351         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5352                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5353
5354         if (c->std_input == EXEC_INPUT_FILE)
5355                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5356         if (c->std_output == EXEC_OUTPUT_FILE)
5357                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5358         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5359                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5360         if (c->std_error == EXEC_OUTPUT_FILE)
5361                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5362         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5363                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5364
5365         if (c->tty_path)
5366                 fprintf(f,
5367                         "%sTTYPath: %s\n"
5368                         "%sTTYReset: %s\n"
5369                         "%sTTYVHangup: %s\n"
5370                         "%sTTYVTDisallocate: %s\n",
5371                         prefix, c->tty_path,
5372                         prefix, yes_no(c->tty_reset),
5373                         prefix, yes_no(c->tty_vhangup),
5374                         prefix, yes_no(c->tty_vt_disallocate));
5375
5376         if (IN_SET(c->std_output,
5377                    EXEC_OUTPUT_KMSG,
5378                    EXEC_OUTPUT_JOURNAL,
5379                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5380                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5381             IN_SET(c->std_error,
5382                    EXEC_OUTPUT_KMSG,
5383                    EXEC_OUTPUT_JOURNAL,
5384                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5385                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5386
5387                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5388
5389                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5390                 if (r >= 0)
5391                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5392
5393                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5394                 if (r >= 0)
5395                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5396         }
5397
5398         if (c->log_level_max >= 0) {
5399                 _cleanup_free_ char *t = NULL;
5400
5401                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5402
5403                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5404         }
5405
5406         if (c->log_ratelimit_interval_usec > 0) {
5407                 char buf_timespan[FORMAT_TIMESPAN_MAX];
5408
5409                 fprintf(f,
5410                         "%sLogRateLimitIntervalSec: %s\n",
5411                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
5412         }
5413
5414         if (c->log_ratelimit_burst > 0)
5415                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5416
5417         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5418                 fprintf(f, "%sLogExtraFields: ", prefix);
5419                 fwrite(c->log_extra_fields[j].iov_base,
5420                        1, c->log_extra_fields[j].iov_len,
5421                        f);
5422                 fputc('\n', f);
5423         }
5424
5425         if (c->log_namespace)
5426                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5427
5428         if (c->secure_bits) {
5429                 _cleanup_free_ char *str = NULL;
5430
5431                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5432                 if (r >= 0)
5433                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5434         }
5435
5436         if (c->capability_bounding_set != CAP_ALL) {
5437                 _cleanup_free_ char *str = NULL;
5438
5439                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5440                 if (r >= 0)
5441                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5442         }
5443
5444         if (c->capability_ambient_set != 0) {
5445                 _cleanup_free_ char *str = NULL;
5446
5447                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5448                 if (r >= 0)
5449                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5450         }
5451
5452         if (c->user)
5453                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5454         if (c->group)
5455                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5456
5457         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5458
5459         if (!strv_isempty(c->supplementary_groups)) {
5460                 fprintf(f, "%sSupplementaryGroups:", prefix);
5461                 strv_fprintf(f, c->supplementary_groups);
5462                 fputs("\n", f);
5463         }
5464
5465         if (c->pam_name)
5466                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5467
5468         if (!strv_isempty(c->read_write_paths)) {
5469                 fprintf(f, "%sReadWritePaths:", prefix);
5470                 strv_fprintf(f, c->read_write_paths);
5471                 fputs("\n", f);
5472         }
5473
5474         if (!strv_isempty(c->read_only_paths)) {
5475                 fprintf(f, "%sReadOnlyPaths:", prefix);
5476                 strv_fprintf(f, c->read_only_paths);
5477                 fputs("\n", f);
5478         }
5479
5480         if (!strv_isempty(c->inaccessible_paths)) {
5481                 fprintf(f, "%sInaccessiblePaths:", prefix);
5482                 strv_fprintf(f, c->inaccessible_paths);
5483                 fputs("\n", f);
5484         }
5485
5486         for (size_t i = 0; i < c->n_bind_mounts; i++)
5487                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5488                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5489                         c->bind_mounts[i].ignore_enoent ? "-": "",
5490                         c->bind_mounts[i].source,
5491                         c->bind_mounts[i].destination,
5492                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5493
5494         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5495                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5496
5497                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5498                         t->path,
5499                         isempty(t->options) ? "" : ":",
5500                         strempty(t->options));
5501         }
5502
5503         if (c->utmp_id)
5504                 fprintf(f,
5505                         "%sUtmpIdentifier: %s\n",
5506                         prefix, c->utmp_id);
5507
5508         if (c->selinux_context)
5509                 fprintf(f,
5510                         "%sSELinuxContext: %s%s\n",
5511                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5512
5513         if (c->apparmor_profile)
5514                 fprintf(f,
5515                         "%sAppArmorProfile: %s%s\n",
5516                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5517
5518         if (c->smack_process_label)
5519                 fprintf(f,
5520                         "%sSmackProcessLabel: %s%s\n",
5521                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5522
5523         if (c->personality != PERSONALITY_INVALID)
5524                 fprintf(f,
5525                         "%sPersonality: %s\n",
5526                         prefix, strna(personality_to_string(c->personality)));
5527
5528         fprintf(f,
5529                 "%sLockPersonality: %s\n",
5530                 prefix, yes_no(c->lock_personality));
5531
5532         if (c->syscall_filter) {
5533 #if HAVE_SECCOMP
5534                 void *id, *val;
5535                 bool first = true;
5536 #endif
5537
5538                 fprintf(f,
5539                         "%sSystemCallFilter: ",
5540                         prefix);
5541
5542                 if (!c->syscall_allow_list)
5543                         fputc('~', f);
5544
5545 #if HAVE_SECCOMP
5546                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5547                         _cleanup_free_ char *name = NULL;
5548                         const char *errno_name = NULL;
5549                         int num = PTR_TO_INT(val);
5550
5551                         if (first)
5552                                 first = false;
5553                         else
5554                                 fputc(' ', f);
5555
5556                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5557                         fputs(strna(name), f);
5558
5559                         if (num >= 0) {
5560                                 errno_name = seccomp_errno_or_action_to_string(num);
5561                                 if (errno_name)
5562                                         fprintf(f, ":%s", errno_name);
5563                                 else
5564                                         fprintf(f, ":%d", num);
5565                         }
5566                 }
5567 #endif
5568
5569                 fputc('\n', f);
5570         }
5571
5572         if (c->syscall_archs) {
5573 #if HAVE_SECCOMP
5574                 void *id;
5575 #endif
5576
5577                 fprintf(f,
5578                         "%sSystemCallArchitectures:",
5579                         prefix);
5580
5581 #if HAVE_SECCOMP
5582                 SET_FOREACH(id, c->syscall_archs)
5583                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5584 #endif
5585                 fputc('\n', f);
5586         }
5587
5588         if (exec_context_restrict_namespaces_set(c)) {
5589                 _cleanup_free_ char *s = NULL;
5590
5591                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5592                 if (r >= 0)
5593                         fprintf(f, "%sRestrictNamespaces: %s\n",
5594                                 prefix, strna(s));
5595         }
5596
5597         if (c->network_namespace_path)
5598                 fprintf(f,
5599                         "%sNetworkNamespacePath: %s\n",
5600                         prefix, c->network_namespace_path);
5601
5602         if (c->syscall_errno > 0) {
5603 #if HAVE_SECCOMP
5604                 const char *errno_name;
5605 #endif
5606
5607                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5608
5609 #if HAVE_SECCOMP
5610                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5611                 if (errno_name)
5612                         fputs(errno_name, f);
5613                 else
5614                         fprintf(f, "%d", c->syscall_errno);
5615 #endif
5616                 fputc('\n', f);
5617         }
5618
5619         for (size_t i = 0; i < c->n_mount_images; i++) {
5620                 MountOptions *o;
5621
5622                 fprintf(f, "%sMountImages: %s%s:%s%s", prefix,
5623                         c->mount_images[i].ignore_enoent ? "-": "",
5624                         c->mount_images[i].source,
5625                         c->mount_images[i].destination,
5626                         LIST_IS_EMPTY(c->mount_images[i].mount_options) ? "": ":");
5627                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5628                         fprintf(f, "%s:%s",
5629                                 partition_designator_to_string(o->partition_designator),
5630                                 o->options);
5631                 fprintf(f, "\n");
5632         }
5633 }
5634
5635 bool exec_context_maintains_privileges(const ExecContext *c) {
5636         assert(c);
5637
5638         /* Returns true if the process forked off would run under
5639          * an unchanged UID or as root. */
5640
5641         if (!c->user)
5642                 return true;
5643
5644         if (streq(c->user, "root") || streq(c->user, "0"))
5645                 return true;
5646
5647         return false;
5648 }
5649
5650 int exec_context_get_effective_ioprio(const ExecContext *c) {
5651         int p;
5652
5653         assert(c);
5654
5655         if (c->ioprio_set)
5656                 return c->ioprio;
5657
5658         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5659         if (p < 0)
5660                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5661
5662         return p;
5663 }
5664
5665 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5666         assert(c);
5667
5668         /* Explicit setting wins */
5669         if (c->mount_apivfs_set)
5670                 return c->mount_apivfs;
5671
5672         /* Default to "yes" if root directory or image are specified */
5673         if (exec_context_with_rootfs(c))
5674                 return true;
5675
5676         return false;
5677 }
5678
5679 void exec_context_free_log_extra_fields(ExecContext *c) {
5680         assert(c);
5681
5682         for (size_t l = 0; l < c->n_log_extra_fields; l++)
5683                 free(c->log_extra_fields[l].iov_base);
5684         c->log_extra_fields = mfree(c->log_extra_fields);
5685         c->n_log_extra_fields = 0;
5686 }
5687
5688 void exec_context_revert_tty(ExecContext *c) {
5689         int r;
5690
5691         assert(c);
5692
5693         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5694         exec_context_tty_reset(c, NULL);
5695
5696         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5697          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5698          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5699
5700         if (exec_context_may_touch_tty(c)) {
5701                 const char *path;
5702
5703                 path = exec_context_tty_path(c);
5704                 if (path) {
5705                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5706                         if (r < 0 && r != -ENOENT)
5707                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5708                 }
5709         }
5710 }
5711
5712 int exec_context_get_clean_directories(
5713                 ExecContext *c,
5714                 char **prefix,
5715                 ExecCleanMask mask,
5716                 char ***ret) {
5717
5718         _cleanup_strv_free_ char **l = NULL;
5719         int r;
5720
5721         assert(c);
5722         assert(prefix);
5723         assert(ret);
5724
5725         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5726                 char **i;
5727
5728                 if (!FLAGS_SET(mask, 1U << t))
5729                         continue;
5730
5731                 if (!prefix[t])
5732                         continue;
5733
5734                 STRV_FOREACH(i, c->directories[t].paths) {
5735                         char *j;
5736
5737                         j = path_join(prefix[t], *i);
5738                         if (!j)
5739                                 return -ENOMEM;
5740
5741                         r = strv_consume(&l, j);
5742                         if (r < 0)
5743                                 return r;
5744
5745                         /* Also remove private directories unconditionally. */
5746                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5747                                 j = path_join(prefix[t], "private", *i);
5748                                 if (!j)
5749                                         return -ENOMEM;
5750
5751                                 r = strv_consume(&l, j);
5752                                 if (r < 0)
5753                                         return r;
5754                         }
5755                 }
5756         }
5757
5758         *ret = TAKE_PTR(l);
5759         return 0;
5760 }
5761
5762 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5763         ExecCleanMask mask = 0;
5764
5765         assert(c);
5766         assert(ret);
5767
5768         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5769                 if (!strv_isempty(c->directories[t].paths))
5770                         mask |= 1U << t;
5771
5772         *ret = mask;
5773         return 0;
5774 }
5775
5776 void exec_status_start(ExecStatus *s, pid_t pid) {
5777         assert(s);
5778
5779         *s = (ExecStatus) {
5780                 .pid = pid,
5781         };
5782
5783         dual_timestamp_get(&s->start_timestamp);
5784 }
5785
5786 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5787         assert(s);
5788
5789         if (s->pid != pid)
5790                 *s = (ExecStatus) {
5791                         .pid = pid,
5792                 };
5793
5794         dual_timestamp_get(&s->exit_timestamp);
5795
5796         s->code = code;
5797         s->status = status;
5798
5799         if (context && context->utmp_id)
5800                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5801 }
5802
5803 void exec_status_reset(ExecStatus *s) {
5804         assert(s);
5805
5806         *s = (ExecStatus) {};
5807 }
5808
5809 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5810         char buf[FORMAT_TIMESTAMP_MAX];
5811
5812         assert(s);
5813         assert(f);
5814
5815         if (s->pid <= 0)
5816                 return;
5817
5818         prefix = strempty(prefix);
5819
5820         fprintf(f,
5821                 "%sPID: "PID_FMT"\n",
5822                 prefix, s->pid);
5823
5824         if (dual_timestamp_is_set(&s->start_timestamp))
5825                 fprintf(f,
5826                         "%sStart Timestamp: %s\n",
5827                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5828
5829         if (dual_timestamp_is_set(&s->exit_timestamp))
5830                 fprintf(f,
5831                         "%sExit Timestamp: %s\n"
5832                         "%sExit Code: %s\n"
5833                         "%sExit Status: %i\n",
5834                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5835                         prefix, sigchld_code_to_string(s->code),
5836                         prefix, s->status);
5837 }
5838
5839 static char *exec_command_line(char **argv) {
5840         size_t k;
5841         char *n, *p, **a;
5842         bool first = true;
5843
5844         assert(argv);
5845
5846         k = 1;
5847         STRV_FOREACH(a, argv)
5848                 k += strlen(*a)+3;
5849
5850         n = new(char, k);
5851         if (!n)
5852                 return NULL;
5853
5854         p = n;
5855         STRV_FOREACH(a, argv) {
5856
5857                 if (!first)
5858                         *(p++) = ' ';
5859                 else
5860                         first = false;
5861
5862                 if (strpbrk(*a, WHITESPACE)) {
5863                         *(p++) = '\'';
5864                         p = stpcpy(p, *a);
5865                         *(p++) = '\'';
5866                 } else
5867                         p = stpcpy(p, *a);
5868
5869         }
5870
5871         *p = 0;
5872
5873         /* FIXME: this doesn't really handle arguments that have
5874          * spaces and ticks in them */
5875
5876         return n;
5877 }
5878
5879 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5880         _cleanup_free_ char *cmd = NULL;
5881         const char *prefix2;
5882
5883         assert(c);
5884         assert(f);
5885
5886         prefix = strempty(prefix);
5887         prefix2 = strjoina(prefix, "\t");
5888
5889         cmd = exec_command_line(c->argv);
5890         fprintf(f,
5891                 "%sCommand Line: %s\n",
5892                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5893
5894         exec_status_dump(&c->exec_status, f, prefix2);
5895 }
5896
5897 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5898         assert(f);
5899
5900         prefix = strempty(prefix);
5901
5902         LIST_FOREACH(command, c, c)
5903                 exec_command_dump(c, f, prefix);
5904 }
5905
5906 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5907         ExecCommand *end;
5908
5909         assert(l);
5910         assert(e);
5911
5912         if (*l) {
5913                 /* It's kind of important, that we keep the order here */
5914                 LIST_FIND_TAIL(command, *l, end);
5915                 LIST_INSERT_AFTER(command, *l, end, e);
5916         } else
5917               *l = e;
5918 }
5919
5920 int exec_command_set(ExecCommand *c, const char *path, ...) {
5921         va_list ap;
5922         char **l, *p;
5923
5924         assert(c);
5925         assert(path);
5926
5927         va_start(ap, path);
5928         l = strv_new_ap(path, ap);
5929         va_end(ap);
5930
5931         if (!l)
5932                 return -ENOMEM;
5933
5934         p = strdup(path);
5935         if (!p) {
5936                 strv_free(l);
5937                 return -ENOMEM;
5938         }
5939
5940         free_and_replace(c->path, p);
5941
5942         return strv_free_and_replace(c->argv, l);
5943 }
5944
5945 int exec_command_append(ExecCommand *c, const char *path, ...) {
5946         _cleanup_strv_free_ char **l = NULL;
5947         va_list ap;
5948         int r;
5949
5950         assert(c);
5951         assert(path);
5952
5953         va_start(ap, path);
5954         l = strv_new_ap(path, ap);
5955         va_end(ap);
5956
5957         if (!l)
5958                 return -ENOMEM;
5959
5960         r = strv_extend_strv(&c->argv, l, false);
5961         if (r < 0)
5962                 return r;
5963
5964         return 0;
5965 }
5966
5967 static void *remove_tmpdir_thread(void *p) {
5968         _cleanup_free_ char *path = p;
5969
5970         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5971         return NULL;
5972 }
5973
5974 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5975         int r;
5976
5977         if (!rt)
5978                 return NULL;
5979
5980         if (rt->manager)
5981                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5982
5983         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5984
5985         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
5986                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5987
5988                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5989                 if (r < 0)
5990                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5991                 else
5992                         rt->tmp_dir = NULL;
5993         }
5994
5995         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
5996                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5997
5998                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5999                 if (r < 0)
6000                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6001                 else
6002                         rt->var_tmp_dir = NULL;
6003         }
6004
6005         rt->id = mfree(rt->id);
6006         rt->tmp_dir = mfree(rt->tmp_dir);
6007         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6008         safe_close_pair(rt->netns_storage_socket);
6009         return mfree(rt);
6010 }
6011
6012 static void exec_runtime_freep(ExecRuntime **rt) {
6013         (void) exec_runtime_free(*rt, false);
6014 }
6015
6016 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6017         _cleanup_free_ char *id_copy = NULL;
6018         ExecRuntime *n;
6019
6020         assert(ret);
6021
6022         id_copy = strdup(id);
6023         if (!id_copy)
6024                 return -ENOMEM;
6025
6026         n = new(ExecRuntime, 1);
6027         if (!n)
6028                 return -ENOMEM;
6029
6030         *n = (ExecRuntime) {
6031                 .id = TAKE_PTR(id_copy),
6032                 .netns_storage_socket = { -1, -1 },
6033         };
6034
6035         *ret = n;
6036         return 0;
6037 }
6038
6039 static int exec_runtime_add(
6040                 Manager *m,
6041                 const char *id,
6042                 char **tmp_dir,
6043                 char **var_tmp_dir,
6044                 int netns_storage_socket[2],
6045                 ExecRuntime **ret) {
6046
6047         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6048         int r;
6049
6050         assert(m);
6051         assert(id);
6052
6053         /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */
6054
6055         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
6056         if (r < 0)
6057                 return r;
6058
6059         r = exec_runtime_allocate(&rt, id);
6060         if (r < 0)
6061                 return r;
6062
6063         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
6064         if (r < 0)
6065                 return r;
6066
6067         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6068         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6069         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6070
6071         if (netns_storage_socket) {
6072                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6073                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6074         }
6075
6076         rt->manager = m;
6077
6078         if (ret)
6079                 *ret = rt;
6080         /* do not remove created ExecRuntime object when the operation succeeds. */
6081         TAKE_PTR(rt);
6082         return 0;
6083 }
6084
6085 static int exec_runtime_make(
6086                 Manager *m,
6087                 const ExecContext *c,
6088                 const char *id,
6089                 ExecRuntime **ret) {
6090
6091         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6092         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
6093         int r;
6094
6095         assert(m);
6096         assert(c);
6097         assert(id);
6098
6099         /* It is not necessary to create ExecRuntime object. */
6100         if (!c->private_network && !c->private_tmp && !c->network_namespace_path) {
6101                 *ret = NULL;
6102                 return 0;
6103         }
6104
6105         if (c->private_tmp &&
6106             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6107               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6108                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6109                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6110                 if (r < 0)
6111                         return r;
6112         }
6113
6114         if (c->private_network || c->network_namespace_path) {
6115                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6116                         return -errno;
6117         }
6118
6119         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret);
6120         if (r < 0)
6121                 return r;
6122
6123         return 1;
6124 }
6125
6126 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6127         ExecRuntime *rt;
6128         int r;
6129
6130         assert(m);
6131         assert(id);
6132         assert(ret);
6133
6134         rt = hashmap_get(m->exec_runtime_by_id, id);
6135         if (rt)
6136                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6137                 goto ref;
6138
6139         if (!create) {
6140                 *ret = NULL;
6141                 return 0;
6142         }
6143
6144         /* If not found, then create a new object. */
6145         r = exec_runtime_make(m, c, id, &rt);
6146         if (r < 0)
6147                 return r;
6148         if (r == 0) {
6149                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6150                 *ret = NULL;
6151                 return 0;
6152         }
6153
6154 ref:
6155         /* increment reference counter. */
6156         rt->n_ref++;
6157         *ret = rt;
6158         return 1;
6159 }
6160
6161 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6162         if (!rt)
6163                 return NULL;
6164
6165         assert(rt->n_ref > 0);
6166
6167         rt->n_ref--;
6168         if (rt->n_ref > 0)
6169                 return NULL;
6170
6171         return exec_runtime_free(rt, destroy);
6172 }
6173
6174 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6175         ExecRuntime *rt;
6176
6177         assert(m);
6178         assert(f);
6179         assert(fds);
6180
6181         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6182                 fprintf(f, "exec-runtime=%s", rt->id);
6183
6184                 if (rt->tmp_dir)
6185                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6186
6187                 if (rt->var_tmp_dir)
6188                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6189
6190                 if (rt->netns_storage_socket[0] >= 0) {
6191                         int copy;
6192
6193                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6194                         if (copy < 0)
6195                                 return copy;
6196
6197                         fprintf(f, " netns-socket-0=%i", copy);
6198                 }
6199
6200                 if (rt->netns_storage_socket[1] >= 0) {
6201                         int copy;
6202
6203                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6204                         if (copy < 0)
6205                                 return copy;
6206
6207                         fprintf(f, " netns-socket-1=%i", copy);
6208                 }
6209
6210                 fputc('\n', f);
6211         }
6212
6213         return 0;
6214 }
6215
6216 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6217         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6218         ExecRuntime *rt;
6219         int r;
6220
6221         /* This is for the migration from old (v237 or earlier) deserialization text.
6222          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6223          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6224          * so or not from the serialized text, then we always creates a new object owned by this. */
6225
6226         assert(u);
6227         assert(key);
6228         assert(value);
6229
6230         /* Manager manages ExecRuntime objects by the unit id.
6231          * So, we omit the serialized text when the unit does not have id (yet?)... */
6232         if (isempty(u->id)) {
6233                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6234                 return 0;
6235         }
6236
6237         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
6238         if (r < 0) {
6239                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
6240                 return 0;
6241         }
6242
6243         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6244         if (!rt) {
6245                 r = exec_runtime_allocate(&rt_create, u->id);
6246                 if (r < 0)
6247                         return log_oom();
6248
6249                 rt = rt_create;
6250         }
6251
6252         if (streq(key, "tmp-dir")) {
6253                 char *copy;
6254
6255                 copy = strdup(value);
6256                 if (!copy)
6257                         return log_oom();
6258
6259                 free_and_replace(rt->tmp_dir, copy);
6260
6261         } else if (streq(key, "var-tmp-dir")) {
6262                 char *copy;
6263
6264                 copy = strdup(value);
6265                 if (!copy)
6266                         return log_oom();
6267
6268                 free_and_replace(rt->var_tmp_dir, copy);
6269
6270         } else if (streq(key, "netns-socket-0")) {
6271                 int fd;
6272
6273                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6274                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6275                         return 0;
6276                 }
6277
6278                 safe_close(rt->netns_storage_socket[0]);
6279                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6280
6281         } else if (streq(key, "netns-socket-1")) {
6282                 int fd;
6283
6284                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6285                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6286                         return 0;
6287                 }
6288
6289                 safe_close(rt->netns_storage_socket[1]);
6290                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6291         } else
6292                 return 0;
6293
6294         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6295         if (rt_create) {
6296                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6297                 if (r < 0) {
6298                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6299                         return 0;
6300                 }
6301
6302                 rt_create->manager = u->manager;
6303
6304                 /* Avoid cleanup */
6305                 TAKE_PTR(rt_create);
6306         }
6307
6308         return 1;
6309 }
6310
6311 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6312         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6313         char *id = NULL;
6314         int r, fdpair[] = {-1, -1};
6315         const char *p, *v = value;
6316         size_t n;
6317
6318         assert(m);
6319         assert(value);
6320         assert(fds);
6321
6322         n = strcspn(v, " ");
6323         id = strndupa(v, n);
6324         if (v[n] != ' ')
6325                 goto finalize;
6326         p = v + n + 1;
6327
6328         v = startswith(p, "tmp-dir=");
6329         if (v) {
6330                 n = strcspn(v, " ");
6331                 tmp_dir = strndup(v, n);
6332                 if (!tmp_dir)
6333                         return log_oom();
6334                 if (v[n] != ' ')
6335                         goto finalize;
6336                 p = v + n + 1;
6337         }
6338
6339         v = startswith(p, "var-tmp-dir=");
6340         if (v) {
6341                 n = strcspn(v, " ");
6342                 var_tmp_dir = strndup(v, n);
6343                 if (!var_tmp_dir)
6344                         return log_oom();
6345                 if (v[n] != ' ')
6346                         goto finalize;
6347                 p = v + n + 1;
6348         }
6349
6350         v = startswith(p, "netns-socket-0=");
6351         if (v) {
6352                 char *buf;
6353
6354                 n = strcspn(v, " ");
6355                 buf = strndupa(v, n);
6356
6357                 r = safe_atoi(buf, &fdpair[0]);
6358                 if (r < 0)
6359                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6360                 if (!fdset_contains(fds, fdpair[0]))
6361                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6362                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", fdpair[0]);
6363                 fdpair[0] = fdset_remove(fds, fdpair[0]);
6364                 if (v[n] != ' ')
6365                         goto finalize;
6366                 p = v + n + 1;
6367         }
6368
6369         v = startswith(p, "netns-socket-1=");
6370         if (v) {
6371                 char *buf;
6372
6373                 n = strcspn(v, " ");
6374                 buf = strndupa(v, n);
6375                 r = safe_atoi(buf, &fdpair[1]);
6376                 if (r < 0)
6377                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6378                 if (!fdset_contains(fds, fdpair[1]))
6379                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6380                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", fdpair[1]);
6381                 fdpair[1] = fdset_remove(fds, fdpair[1]);
6382         }
6383
6384 finalize:
6385         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL);
6386         if (r < 0)
6387                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6388         return 0;
6389 }
6390
6391 void exec_runtime_vacuum(Manager *m) {
6392         ExecRuntime *rt;
6393
6394         assert(m);
6395
6396         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6397
6398         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6399                 if (rt->n_ref > 0)
6400                         continue;
6401
6402                 (void) exec_runtime_free(rt, false);
6403         }
6404 }
6405
6406 void exec_params_clear(ExecParameters *p) {
6407         if (!p)
6408                 return;
6409
6410         p->environment = strv_free(p->environment);
6411         p->fd_names = strv_free(p->fd_names);
6412         p->fds = mfree(p->fds);
6413         p->exec_fd = safe_close(p->exec_fd);
6414 }
6415
6416 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6417         if (!sc)
6418                 return NULL;
6419
6420         free(sc->id);
6421         free(sc->data);
6422         return mfree(sc);
6423 }
6424
6425 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6426
6427 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6428         [EXEC_INPUT_NULL] = "null",
6429         [EXEC_INPUT_TTY] = "tty",
6430         [EXEC_INPUT_TTY_FORCE] = "tty-force",
6431         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6432         [EXEC_INPUT_SOCKET] = "socket",
6433         [EXEC_INPUT_NAMED_FD] = "fd",
6434         [EXEC_INPUT_DATA] = "data",
6435         [EXEC_INPUT_FILE] = "file",
6436 };
6437
6438 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6439
6440 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6441         [EXEC_OUTPUT_INHERIT] = "inherit",
6442         [EXEC_OUTPUT_NULL] = "null",
6443         [EXEC_OUTPUT_TTY] = "tty",
6444         [EXEC_OUTPUT_KMSG] = "kmsg",
6445         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6446         [EXEC_OUTPUT_JOURNAL] = "journal",
6447         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6448         [EXEC_OUTPUT_SOCKET] = "socket",
6449         [EXEC_OUTPUT_NAMED_FD] = "fd",
6450         [EXEC_OUTPUT_FILE] = "file",
6451         [EXEC_OUTPUT_FILE_APPEND] = "append",
6452 };
6453
6454 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6455
6456 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6457         [EXEC_UTMP_INIT] = "init",
6458         [EXEC_UTMP_LOGIN] = "login",
6459         [EXEC_UTMP_USER] = "user",
6460 };
6461
6462 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6463
6464 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6465         [EXEC_PRESERVE_NO] = "no",
6466         [EXEC_PRESERVE_YES] = "yes",
6467         [EXEC_PRESERVE_RESTART] = "restart",
6468 };
6469
6470 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6471
6472 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6473 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6474         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6475         [EXEC_DIRECTORY_STATE] = "StateDirectory",
6476         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6477         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6478         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6479 };
6480
6481 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6482
6483 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6484  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6485  * directories, specifically .timer units with their timestamp touch file. */
6486 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6487         [EXEC_DIRECTORY_RUNTIME] = "runtime",
6488         [EXEC_DIRECTORY_STATE] = "state",
6489         [EXEC_DIRECTORY_CACHE] = "cache",
6490         [EXEC_DIRECTORY_LOGS] = "logs",
6491         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6492 };
6493
6494 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6495
6496 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6497  * the service payload in. */
6498 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6499         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6500         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6501         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6502         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6503         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6504 };
6505
6506 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6507
6508 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6509         [EXEC_KEYRING_INHERIT] = "inherit",
6510         [EXEC_KEYRING_PRIVATE] = "private",
6511         [EXEC_KEYRING_SHARED] = "shared",
6512 };
6513
6514 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);