src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "argv-util.h"
  43 #include "async.h"
  44 #include "barrier.h"
  45 #include "bpf-lsm.h"
  46 #include "cap-list.h"
  47 #include "capability-util.h"
  48 #include "cgroup-setup.h"
  49 #include "chase-symlinks.h"
  50 #include "chown-recursive.h"
  51 #include "constants.h"
  52 #include "cpu-set-util.h"
  53 #include "creds-util.h"
  54 #include "data-fd-util.h"
  55 #include "env-file.h"
  56 #include "env-util.h"
  57 #include "errno-list.h"
  58 #include "escape.h"
  59 #include "execute.h"
  60 #include "exit-status.h"
  61 #include "fd-util.h"
  62 #include "fileio.h"
  63 #include "format-util.h"
  64 #include "glob-util.h"
  65 #include "hexdecoct.h"
  66 #include "io-util.h"
  67 #include "ioprio-util.h"
  68 #include "label.h"
  69 #include "log.h"
  70 #include "macro.h"
  71 #include "manager.h"
  72 #include "manager-dump.h"
  73 #include "memory-util.h"
  74 #include "missing_fs.h"
  75 #include "missing_ioprio.h"
  76 #include "mkdir-label.h"
  77 #include "mount-util.h"
  78 #include "mountpoint-util.h"
  79 #include "namespace.h"
  80 #include "parse-util.h"
  81 #include "path-util.h"
  82 #include "process-util.h"
  83 #include "random-util.h"
  84 #include "recurse-dir.h"
  85 #include "rlimit-util.h"
  86 #include "rm-rf.h"
  87 #if HAVE_SECCOMP
  88 #include "seccomp-util.h"
  89 #endif
  90 #include "securebits-util.h"
  91 #include "selinux-util.h"
  92 #include "signal-util.h"
  93 #include "smack-util.h"
  94 #include "socket-util.h"
  95 #include "sort-util.h"
  96 #include "special.h"
  97 #include "stat-util.h"
  98 #include "string-table.h"
  99 #include "string-util.h"
 100 #include "strv.h"
 101 #include "syslog-util.h"
 102 #include "terminal-util.h"
 103 #include "tmpfile-util.h"
 104 #include "umask-util.h"
 105 #include "unit-serialize.h"
 106 #include "user-util.h"
 107 #include "utmp-wtmp.h"
 108
 109 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 110 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 111
 112 #define SNDBUF_SIZE (8*1024*1024)
 113
 114 static int shift_fds(int fds[], size_t n_fds) {
 115         if (n_fds <= 0)
 116                 return 0;
 117
 118         /* Modifies the fds array! (sorts it) */
 119
 120         assert(fds);
 121
 122         for (int start = 0;;) {
 123                 int restart_from = -1;
 124
 125                 for (int i = start; i < (int) n_fds; i++) {
 126                         int nfd;
 127
 128                         /* Already at right index? */
 129                         if (fds[i] == i+3)
 130                                 continue;
 131
 132                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 133                         if (nfd < 0)
 134                                 return -errno;
 135
 136                         safe_close(fds[i]);
 137                         fds[i] = nfd;
 138
 139                         /* Hmm, the fd we wanted isn't free? Then
 140                          * let's remember that and try again from here */
 141                         if (nfd != i+3 && restart_from < 0)
 142                                 restart_from = i;
 143                 }
 144
 145                 if (restart_from < 0)
 146                         break;
 147
 148                 start = restart_from;
 149         }
 150
 151         return 0;
 152 }
 153
 154 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 155         size_t n_fds;
 156         int r;
 157
 158         n_fds = n_socket_fds + n_storage_fds;
 159         if (n_fds <= 0)
 160                 return 0;
 161
 162         assert(fds);
 163
 164         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 165          * O_NONBLOCK only applies to socket activation though. */
 166
 167         for (size_t i = 0; i < n_fds; i++) {
 168
 169                 if (i < n_socket_fds) {
 170                         r = fd_nonblock(fds[i], nonblock);
 171                         if (r < 0)
 172                                 return r;
 173                 }
 174
 175                 /* We unconditionally drop FD_CLOEXEC from the fds,
 176                  * since after all we want to pass these fds to our
 177                  * children */
 178
 179                 r = fd_cloexec(fds[i], false);
 180                 if (r < 0)
 181                         return r;
 182         }
 183
 184         return 0;
 185 }
 186
 187 static const char *exec_context_tty_path(const ExecContext *context) {
 188         assert(context);
 189
 190         if (context->stdio_as_fds)
 191                 return NULL;
 192
 193         if (context->tty_path)
 194                 return context->tty_path;
 195
 196         return "/dev/console";
 197 }
 198
 199 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 200         const char *path;
 201
 202         assert(context);
 203
 204         path = exec_context_tty_path(context);
 205
 206         if (context->tty_vhangup) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) terminal_vhangup_fd(p->stdin_fd);
 209                 else if (path)
 210                         (void) terminal_vhangup(path);
 211         }
 212
 213         if (context->tty_reset) {
 214                 if (p && p->stdin_fd >= 0)
 215                         (void) reset_terminal_fd(p->stdin_fd, true);
 216                 else if (path)
 217                         (void) reset_terminal(path);
 218         }
 219
 220         if (p && p->stdin_fd >= 0)
 221                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
 222
 223         if (context->tty_vt_disallocate && path)
 224                 (void) vt_disallocate(path);
 225 }
 226
 227 static bool is_terminal_input(ExecInput i) {
 228         return IN_SET(i,
 229                       EXEC_INPUT_TTY,
 230                       EXEC_INPUT_TTY_FORCE,
 231                       EXEC_INPUT_TTY_FAIL);
 232 }
 233
 234 static bool is_terminal_output(ExecOutput o) {
 235         return IN_SET(o,
 236                       EXEC_OUTPUT_TTY,
 237                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 238                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 239 }
 240
 241 static bool is_kmsg_output(ExecOutput o) {
 242         return IN_SET(o,
 243                       EXEC_OUTPUT_KMSG,
 244                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 245 }
 246
 247 static bool exec_context_needs_term(const ExecContext *c) {
 248         assert(c);
 249
 250         /* Return true if the execution context suggests we should set $TERM to something useful. */
 251
 252         if (is_terminal_input(c->std_input))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_output))
 256                 return true;
 257
 258         if (is_terminal_output(c->std_error))
 259                 return true;
 260
 261         return !!c->tty_path;
 262 }
 263
 264 static int open_null_as(int flags, int nfd) {
 265         int fd;
 266
 267         assert(nfd >= 0);
 268
 269         fd = open("/dev/null", flags|O_NOCTTY);
 270         if (fd < 0)
 271                 return -errno;
 272
 273         return move_fd(fd, nfd, false);
 274 }
 275
 276 static int connect_journal_socket(
 277                 int fd,
 278                 const char *log_namespace,
 279                 uid_t uid,
 280                 gid_t gid) {
 281
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         const char *j;
 285         int r;
 286
 287         j = log_namespace ?
 288                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 289                 "/run/systemd/journal/stdout";
 290
 291         if (gid_is_valid(gid)) {
 292                 oldgid = getgid();
 293
 294                 if (setegid(gid) < 0)
 295                         return -errno;
 296         }
 297
 298         if (uid_is_valid(uid)) {
 299                 olduid = getuid();
 300
 301                 if (seteuid(uid) < 0) {
 302                         r = -errno;
 303                         goto restore_gid;
 304                 }
 305         }
 306
 307         r = connect_unix_path(fd, AT_FDCWD, j);
 308
 309         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 310            an LSM interferes. */
 311
 312         if (uid_is_valid(uid))
 313                 (void) seteuid(olduid);
 314
 315  restore_gid:
 316         if (gid_is_valid(gid))
 317                 (void) setegid(oldgid);
 318
 319         return r;
 320 }
 321
 322 static int connect_logger_as(
 323                 const Unit *unit,
 324                 const ExecContext *context,
 325                 const ExecParameters *params,
 326                 ExecOutput output,
 327                 const char *ident,
 328                 int nfd,
 329                 uid_t uid,
 330                 gid_t gid) {
 331
 332         _cleanup_close_ int fd = -EBADF;
 333         int r;
 334
 335         assert(context);
 336         assert(params);
 337         assert(output < _EXEC_OUTPUT_MAX);
 338         assert(ident);
 339         assert(nfd >= 0);
 340
 341         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 342         if (fd < 0)
 343                 return -errno;
 344
 345         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 346         if (r < 0)
 347                 return r;
 348
 349         if (shutdown(fd, SHUT_RD) < 0)
 350                 return -errno;
 351
 352         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 353
 354         if (dprintf(fd,
 355                 "%s\n"
 356                 "%s\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n"
 360                 "%i\n"
 361                 "%i\n",
 362                 context->syslog_identifier ?: ident,
 363                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 364                 context->syslog_priority,
 365                 !!context->syslog_level_prefix,
 366                 false,
 367                 is_kmsg_output(output),
 368                 is_terminal_output(output)) < 0)
 369                 return -errno;
 370
 371         return move_fd(TAKE_FD(fd), nfd, false);
 372 }
 373
 374 static int open_terminal_as(const char *path, int flags, int nfd) {
 375         int fd;
 376
 377         assert(path);
 378         assert(nfd >= 0);
 379
 380         fd = open_terminal(path, flags | O_NOCTTY);
 381         if (fd < 0)
 382                 return fd;
 383
 384         return move_fd(fd, nfd, false);
 385 }
 386
 387 static int acquire_path(const char *path, int flags, mode_t mode) {
 388         _cleanup_close_ int fd = -EBADF;
 389         int r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return TAKE_FD(fd);
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402
 403         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 404
 405         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 406         if (fd < 0)
 407                 return -errno;
 408
 409         r = connect_unix_path(fd, AT_FDCWD, path);
 410         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 411                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 412                  * wasn't an AF_UNIX socket after all */
 413                 return -ENXIO;
 414         if (r < 0)
 415                 return r;
 416
 417         if ((flags & O_ACCMODE) == O_RDONLY)
 418                 r = shutdown(fd, SHUT_WR);
 419         else if ((flags & O_ACCMODE) == O_WRONLY)
 420                 r = shutdown(fd, SHUT_RD);
 421         else
 422                 r = 0;
 423         if (r < 0)
 424                 return -errno;
 425
 426         return TAKE_FD(fd);
 427 }
 428
 429 static int fixup_input(
 430                 const ExecContext *context,
 431                 int socket_fd,
 432                 bool apply_tty_stdin) {
 433
 434         ExecInput std_input;
 435
 436         assert(context);
 437
 438         std_input = context->std_input;
 439
 440         if (is_terminal_input(std_input) && !apply_tty_stdin)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 447                 return EXEC_INPUT_NULL;
 448
 449         return std_input;
 450 }
 451
 452 static int fixup_output(ExecOutput output, int socket_fd) {
 453
 454         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 455                 return EXEC_OUTPUT_INHERIT;
 456
 457         return output;
 458 }
 459
 460 static int setup_input(
 461                 const ExecContext *context,
 462                 const ExecParameters *params,
 463                 int socket_fd,
 464                 const int named_iofds[static 3]) {
 465
 466         ExecInput i;
 467         int r;
 468
 469         assert(context);
 470         assert(params);
 471         assert(named_iofds);
 472
 473         if (params->stdin_fd >= 0) {
 474                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 475                         return -errno;
 476
 477                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 478                 if (isatty(STDIN_FILENO)) {
 479                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 480                         (void) reset_terminal_fd(STDIN_FILENO, true);
 481                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
 482                 }
 483
 484                 return STDIN_FILENO;
 485         }
 486
 487         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 488
 489         switch (i) {
 490
 491         case EXEC_INPUT_NULL:
 492                 return open_null_as(O_RDONLY, STDIN_FILENO);
 493
 494         case EXEC_INPUT_TTY:
 495         case EXEC_INPUT_TTY_FORCE:
 496         case EXEC_INPUT_TTY_FAIL: {
 497                 int fd;
 498
 499                 fd = acquire_terminal(exec_context_tty_path(context),
 500                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 501                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 502                                                                   ACQUIRE_TERMINAL_WAIT,
 503                                       USEC_INFINITY);
 504                 if (fd < 0)
 505                         return fd;
 506
 507                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
 508                 if (r < 0)
 509                         return r;
 510
 511                 return move_fd(fd, STDIN_FILENO, false);
 512         }
 513
 514         case EXEC_INPUT_SOCKET:
 515                 assert(socket_fd >= 0);
 516
 517                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 518
 519         case EXEC_INPUT_NAMED_FD:
 520                 assert(named_iofds[STDIN_FILENO] >= 0);
 521
 522                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 523                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 524
 525         case EXEC_INPUT_DATA: {
 526                 int fd;
 527
 528                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 529                 if (fd < 0)
 530                         return fd;
 531
 532                 return move_fd(fd, STDIN_FILENO, false);
 533         }
 534
 535         case EXEC_INPUT_FILE: {
 536                 bool rw;
 537                 int fd;
 538
 539                 assert(context->stdio_file[STDIN_FILENO]);
 540
 541                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 542                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 543
 544                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 545                 if (fd < 0)
 546                         return fd;
 547
 548                 return move_fd(fd, STDIN_FILENO, false);
 549         }
 550
 551         default:
 552                 assert_not_reached();
 553         }
 554 }
 555
 556 static bool can_inherit_stderr_from_stdout(
 557                 const ExecContext *context,
 558                 ExecOutput o,
 559                 ExecOutput e) {
 560
 561         assert(context);
 562
 563         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 564          * stderr fd */
 565
 566         if (e == EXEC_OUTPUT_INHERIT)
 567                 return true;
 568         if (e != o)
 569                 return false;
 570
 571         if (e == EXEC_OUTPUT_NAMED_FD)
 572                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 573
 574         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 575                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 576
 577         return true;
 578 }
 579
 580 static int setup_output(
 581                 const Unit *unit,
 582                 const ExecContext *context,
 583                 const ExecParameters *params,
 584                 int fileno,
 585                 int socket_fd,
 586                 const int named_iofds[static 3],
 587                 const char *ident,
 588                 uid_t uid,
 589                 gid_t gid,
 590                 dev_t *journal_stream_dev,
 591                 ino_t *journal_stream_ino) {
 592
 593         ExecOutput o;
 594         ExecInput i;
 595         int r;
 596
 597         assert(unit);
 598         assert(context);
 599         assert(params);
 600         assert(ident);
 601         assert(journal_stream_dev);
 602         assert(journal_stream_ino);
 603
 604         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 605
 606                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 607                         return -errno;
 608
 609                 return STDOUT_FILENO;
 610         }
 611
 612         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 613                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 614                         return -errno;
 615
 616                 return STDERR_FILENO;
 617         }
 618
 619         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 620         o = fixup_output(context->std_output, socket_fd);
 621
 622         if (fileno == STDERR_FILENO) {
 623                 ExecOutput e;
 624                 e = fixup_output(context->std_error, socket_fd);
 625
 626                 /* This expects the input and output are already set up */
 627
 628                 /* Don't change the stderr file descriptor if we inherit all
 629                  * the way and are not on a tty */
 630                 if (e == EXEC_OUTPUT_INHERIT &&
 631                     o == EXEC_OUTPUT_INHERIT &&
 632                     i == EXEC_INPUT_NULL &&
 633                     !is_terminal_input(context->std_input) &&
 634                     getppid() != 1)
 635                         return fileno;
 636
 637                 /* Duplicate from stdout if possible */
 638                 if (can_inherit_stderr_from_stdout(context, o, e))
 639                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 640
 641                 o = e;
 642
 643         } else if (o == EXEC_OUTPUT_INHERIT) {
 644                 /* If input got downgraded, inherit the original value */
 645                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 646                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 647
 648                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 649                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 650                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 651
 652                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 653                 if (getppid() != 1)
 654                         return fileno;
 655
 656                 /* We need to open /dev/null here anew, to get the right access mode. */
 657                 return open_null_as(O_WRONLY, fileno);
 658         }
 659
 660         switch (o) {
 661
 662         case EXEC_OUTPUT_NULL:
 663                 return open_null_as(O_WRONLY, fileno);
 664
 665         case EXEC_OUTPUT_TTY:
 666                 if (is_terminal_input(i))
 667                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 668
 669                 /* We don't reset the terminal if this is just about output */
 670                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 671
 672         case EXEC_OUTPUT_KMSG:
 673         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 674         case EXEC_OUTPUT_JOURNAL:
 675         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 676                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 677                 if (r < 0) {
 678                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 679                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 680                         r = open_null_as(O_WRONLY, fileno);
 681                 } else {
 682                         struct stat st;
 683
 684                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 685                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 686                          * services to detect whether they are connected to the journal or not.
 687                          *
 688                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 689                          * about STDERR as that's usually the best way to do logging. */
 690
 691                         if (fstat(fileno, &st) >= 0 &&
 692                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 693                                 *journal_stream_dev = st.st_dev;
 694                                 *journal_stream_ino = st.st_ino;
 695                         }
 696                 }
 697                 return r;
 698
 699         case EXEC_OUTPUT_SOCKET:
 700                 assert(socket_fd >= 0);
 701
 702                 return RET_NERRNO(dup2(socket_fd, fileno));
 703
 704         case EXEC_OUTPUT_NAMED_FD:
 705                 assert(named_iofds[fileno] >= 0);
 706
 707                 (void) fd_nonblock(named_iofds[fileno], false);
 708                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 709
 710         case EXEC_OUTPUT_FILE:
 711         case EXEC_OUTPUT_FILE_APPEND:
 712         case EXEC_OUTPUT_FILE_TRUNCATE: {
 713                 bool rw;
 714                 int fd, flags;
 715
 716                 assert(context->stdio_file[fileno]);
 717
 718                 rw = context->std_input == EXEC_INPUT_FILE &&
 719                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 720
 721                 if (rw)
 722                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 723
 724                 flags = O_WRONLY;
 725                 if (o == EXEC_OUTPUT_FILE_APPEND)
 726                         flags |= O_APPEND;
 727                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 728                         flags |= O_TRUNC;
 729
 730                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 731                 if (fd < 0)
 732                         return fd;
 733
 734                 return move_fd(fd, fileno, 0);
 735         }
 736
 737         default:
 738                 assert_not_reached();
 739         }
 740 }
 741
 742 static int chown_terminal(int fd, uid_t uid) {
 743         int r;
 744
 745         assert(fd >= 0);
 746
 747         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 748         if (isatty(fd) < 1) {
 749                 if (IN_SET(errno, EINVAL, ENOTTY))
 750                         return 0; /* not a tty */
 751
 752                 return -errno;
 753         }
 754
 755         /* This might fail. What matters are the results. */
 756         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 757         if (r < 0)
 758                 return r;
 759
 760         return 1;
 761 }
 762
 763 static int setup_confirm_stdio(
 764                 const ExecContext *context,
 765                 const char *vc,
 766                 int *ret_saved_stdin,
 767                 int *ret_saved_stdout) {
 768
 769         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 770         int r;
 771
 772         assert(ret_saved_stdin);
 773         assert(ret_saved_stdout);
 774
 775         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 776         if (saved_stdin < 0)
 777                 return -errno;
 778
 779         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 780         if (saved_stdout < 0)
 781                 return -errno;
 782
 783         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 784         if (fd < 0)
 785                 return fd;
 786
 787         r = chown_terminal(fd, getuid());
 788         if (r < 0)
 789                 return r;
 790
 791         r = reset_terminal_fd(fd, true);
 792         if (r < 0)
 793                 return r;
 794
 795         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
 796         if (r < 0)
 797                 return r;
 798
 799         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 800         TAKE_FD(fd);
 801         if (r < 0)
 802                 return r;
 803
 804         *ret_saved_stdin = TAKE_FD(saved_stdin);
 805         *ret_saved_stdout = TAKE_FD(saved_stdout);
 806         return 0;
 807 }
 808
 809 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 810         assert(err < 0);
 811
 812         if (err == -ETIMEDOUT)
 813                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 814         else {
 815                 errno = -err;
 816                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 817         }
 818 }
 819
 820 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 821         _cleanup_close_ int fd = -EBADF;
 822
 823         assert(vc);
 824
 825         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 826         if (fd < 0)
 827                 return;
 828
 829         write_confirm_error_fd(err, fd, u);
 830 }
 831
 832 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 833         int r = 0;
 834
 835         assert(saved_stdin);
 836         assert(saved_stdout);
 837
 838         release_terminal();
 839
 840         if (*saved_stdin >= 0)
 841                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 842                         r = -errno;
 843
 844         if (*saved_stdout >= 0)
 845                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 846                         r = -errno;
 847
 848         *saved_stdin = safe_close(*saved_stdin);
 849         *saved_stdout = safe_close(*saved_stdout);
 850
 851         return r;
 852 }
 853
 854 enum {
 855         CONFIRM_PRETEND_FAILURE = -1,
 856         CONFIRM_PRETEND_SUCCESS =  0,
 857         CONFIRM_EXECUTE = 1,
 858 };
 859
 860 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 861         int saved_stdout = -1, saved_stdin = -1, r;
 862         _cleanup_free_ char *e = NULL;
 863         char c;
 864
 865         /* For any internal errors, assume a positive response. */
 866         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 867         if (r < 0) {
 868                 write_confirm_error(r, vc, u);
 869                 return CONFIRM_EXECUTE;
 870         }
 871
 872         /* confirm_spawn might have been disabled while we were sleeping. */
 873         if (manager_is_confirm_spawn_disabled(u->manager)) {
 874                 r = 1;
 875                 goto restore_stdio;
 876         }
 877
 878         e = ellipsize(cmdline, 60, 100);
 879         if (!e) {
 880                 log_oom();
 881                 r = CONFIRM_EXECUTE;
 882                 goto restore_stdio;
 883         }
 884
 885         for (;;) {
 886                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 887                 if (r < 0) {
 888                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 889                         r = CONFIRM_EXECUTE;
 890                         goto restore_stdio;
 891                 }
 892
 893                 switch (c) {
 894                 case 'c':
 895                         printf("Resuming normal execution.\n");
 896                         manager_disable_confirm_spawn();
 897                         r = 1;
 898                         break;
 899                 case 'D':
 900                         unit_dump(u, stdout, "  ");
 901                         continue; /* ask again */
 902                 case 'f':
 903                         printf("Failing execution.\n");
 904                         r = CONFIRM_PRETEND_FAILURE;
 905                         break;
 906                 case 'h':
 907                         printf("  c - continue, proceed without asking anymore\n"
 908                                "  D - dump, show the state of the unit\n"
 909                                "  f - fail, don't execute the command and pretend it failed\n"
 910                                "  h - help\n"
 911                                "  i - info, show a short summary of the unit\n"
 912                                "  j - jobs, show jobs that are in progress\n"
 913                                "  s - skip, don't execute the command and pretend it succeeded\n"
 914                                "  y - yes, execute the command\n");
 915                         continue; /* ask again */
 916                 case 'i':
 917                         printf("  Description: %s\n"
 918                                "  Unit:        %s\n"
 919                                "  Command:     %s\n",
 920                                u->id, u->description, cmdline);
 921                         continue; /* ask again */
 922                 case 'j':
 923                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 924                         continue; /* ask again */
 925                 case 'n':
 926                         /* 'n' was removed in favor of 'f'. */
 927                         printf("Didn't understand 'n', did you mean 'f'?\n");
 928                         continue; /* ask again */
 929                 case 's':
 930                         printf("Skipping execution.\n");
 931                         r = CONFIRM_PRETEND_SUCCESS;
 932                         break;
 933                 case 'y':
 934                         r = CONFIRM_EXECUTE;
 935                         break;
 936                 default:
 937                         assert_not_reached();
 938                 }
 939                 break;
 940         }
 941
 942 restore_stdio:
 943         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 944         return r;
 945 }
 946
 947 static int get_fixed_user(const ExecContext *c, const char **user,
 948                           uid_t *uid, gid_t *gid,
 949                           const char **home, const char **shell) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->user)
 956                 return 0;
 957
 958         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 959          * (i.e. are "/" or "/bin/nologin"). */
 960
 961         name = c->user;
 962         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 963         if (r < 0)
 964                 return r;
 965
 966         *user = name;
 967         return 0;
 968 }
 969
 970 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 971         int r;
 972         const char *name;
 973
 974         assert(c);
 975
 976         if (!c->group)
 977                 return 0;
 978
 979         name = c->group;
 980         r = get_group_creds(&name, gid, 0);
 981         if (r < 0)
 982                 return r;
 983
 984         *group = name;
 985         return 0;
 986 }
 987
 988 static int get_supplementary_groups(const ExecContext *c, const char *user,
 989                                     const char *group, gid_t gid,
 990                                     gid_t **supplementary_gids, int *ngids) {
 991         int r, k = 0;
 992         int ngroups_max;
 993         bool keep_groups = false;
 994         gid_t *groups = NULL;
 995         _cleanup_free_ gid_t *l_gids = NULL;
 996
 997         assert(c);
 998
 999         /*
1000          * If user is given, then lookup GID and supplementary groups list.
1001          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1002          * here and as early as possible so we keep the list of supplementary
1003          * groups of the caller.
1004          */
1005         if (user && gid_is_valid(gid) && gid != 0) {
1006                 /* First step, initialize groups from /etc/groups */
1007                 if (initgroups(user, gid) < 0)
1008                         return -errno;
1009
1010                 keep_groups = true;
1011         }
1012
1013         if (strv_isempty(c->supplementary_groups))
1014                 return 0;
1015
1016         /*
1017          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1018          * be positive, otherwise fail.
1019          */
1020         errno = 0;
1021         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1022         if (ngroups_max <= 0)
1023                 return errno_or_else(EOPNOTSUPP);
1024
1025         l_gids = new(gid_t, ngroups_max);
1026         if (!l_gids)
1027                 return -ENOMEM;
1028
1029         if (keep_groups) {
1030                 /*
1031                  * Lookup the list of groups that the user belongs to, we
1032                  * avoid NSS lookups here too for gid=0.
1033                  */
1034                 k = ngroups_max;
1035                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1036                         return -EINVAL;
1037         } else
1038                 k = 0;
1039
1040         STRV_FOREACH(i, c->supplementary_groups) {
1041                 const char *g;
1042
1043                 if (k >= ngroups_max)
1044                         return -E2BIG;
1045
1046                 g = *i;
1047                 r = get_group_creds(&g, l_gids+k, 0);
1048                 if (r < 0)
1049                         return r;
1050
1051                 k++;
1052         }
1053
1054         /*
1055          * Sets ngids to zero to drop all supplementary groups, happens
1056          * when we are under root and SupplementaryGroups= is empty.
1057          */
1058         if (k == 0) {
1059                 *ngids = 0;
1060                 return 0;
1061         }
1062
1063         /* Otherwise get the final list of supplementary groups */
1064         groups = memdup(l_gids, sizeof(gid_t) * k);
1065         if (!groups)
1066                 return -ENOMEM;
1067
1068         *supplementary_gids = groups;
1069         *ngids = k;
1070
1071         groups = NULL;
1072
1073         return 0;
1074 }
1075
/* Applies the supplementary group list and the primary GID to the calling process.
 *
 * The supplementary groups are only set when ngids is positive, and the real/effective/saved GID is only
 * changed when gid is valid. Returns 0 on success, the negative error from maybe_setgroups(), or -errno if
 * setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1094
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in 'bits'
 * is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written in that case), 1 if the
 * securebits were changed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1108
1109 static int enforce_user(const ExecContext *context, uid_t uid) {
1110         assert(context);
1111         int r;
1112
1113         if (!uid_is_valid(uid))
1114                 return 0;
1115
1116         /* Sets (but doesn't look up) the uid and make sure we keep the
1117          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1118          * required, so we also need keep-caps in this case.
1119          */
1120
1121         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1122
1123                 /* First step: If we need to keep capabilities but
1124                  * drop privileges we need to make sure we keep our
1125                  * caps, while we drop privileges. */
1126                 if (uid != 0) {
1127                         /* Add KEEP_CAPS to the securebits */
1128                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1129                         if (r < 0)
1130                                 return r;
1131                 }
1132         }
1133
1134         /* Second step: actually set the uids */
1135         if (setresuid(uid, uid, uid) < 0)
1136                 return -errno;
1137
1138         /* At this point we should have all necessary capabilities but
1139            are otherwise a normal user. However, the caps might got
1140            corrupted due to the setresuid() so we need clean them up
1141            later. This is done outside of this call. */
1142
1143         return 0;
1144 }
1145
1146 #if HAVE_PAM
1147
1148 static int null_conv(
1149                 int num_msg,
1150                 const struct pam_message **msg,
1151                 struct pam_response **resp,
1152                 void *appdata_ptr) {
1153
1154         /* We don't support conversations */
1155
1156         return PAM_CONV_ERR;
1157 }
1158
1159 #endif
1160
/* Opens a PAM session for service 'name' and user 'user' and forks off a helper process ("(sd-pam)") that
 * stays around in order to tear the session down again later.
 *
 *   name, user  passed to pam_start()
 *   uid, gid    credentials the helper process switches to via setresgid()/setresuid()
 *   tty         value for the PAM_TTY item; if NULL, the TTY name of stdin is used when it can be determined
 *   env         environment block; each entry is pushed into PAM with pam_putenv(), and on success the
 *               block is replaced by the list returned by pam_getenvlist()
 *   fds, n_fds  file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success and a negative errno-style value on failure
 * (-EPERM for failures reported by PAM itself). When built without HAVE_PAM this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_GET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * never returns a negative value and does not set errno. Hence check
                                 * the return value itself, so that 'sig' is only looked at after a
                                 * successful wait. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1377
/* Renames the calling process after the last component of 'path', wrapped in parentheses.
 *
 * If the last component is empty the name "(...)" is used. Components longer than 8 characters are cut
 * down to their final 8 characters. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *name;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: up to 8 chars of name plus the two parentheses, plus the trailing NUL. */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might just
                 * be "systemd-" */
                name += n - 8;
                n = 8;
        }

        process_name[0] = '(';
        memcpy(&process_name[1], name, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = '\0';

        rename_process(process_name);
}
1408
1409 static bool context_has_address_families(const ExecContext *c) {
1410         assert(c);
1411
1412         return c->address_families_allow_list ||
1413                 !set_isempty(c->address_families);
1414 }
1415
1416 static bool context_has_syscall_filters(const ExecContext *c) {
1417         assert(c);
1418
1419         return c->syscall_allow_list ||
1420                 !hashmap_isempty(c->syscall_filter);
1421 }
1422
1423 static bool context_has_syscall_logs(const ExecContext *c) {
1424         assert(c);
1425
1426         return c->syscall_log_allow_list ||
1427                 !hashmap_isempty(c->syscall_log);
1428 }
1429
1430 static bool context_has_no_new_privileges(const ExecContext *c) {
1431         assert(c);
1432
1433         if (c->no_new_privileges)
1434                 return true;
1435
1436         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1437                 return false;
1438
1439         /* We need NNP if we have any form of seccomp and are unprivileged */
1440         return c->lock_personality ||
1441                 c->memory_deny_write_execute ||
1442                 c->private_devices ||
1443                 c->protect_clock ||
1444                 c->protect_hostname ||
1445                 c->protect_kernel_tunables ||
1446                 c->protect_kernel_modules ||
1447                 c->protect_kernel_logs ||
1448                 context_has_address_families(c) ||
1449                 exec_context_restrict_namespaces_set(c) ||
1450                 c->restrict_realtime ||
1451                 c->restrict_suid_sgid ||
1452                 !set_isempty(c->syscall_archs) ||
1453                 context_has_syscall_filters(c) ||
1454                 context_has_syscall_logs(c);
1455 }
1456
1457 static bool exec_context_has_credentials(const ExecContext *context) {
1458
1459         assert(context);
1460
1461         return !hashmap_isempty(context->set_credentials) ||
1462                 !hashmap_isempty(context->load_credentials);
1463 }
1464
1465 #if HAVE_SECCOMP
1466
1467 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1468
1469         if (is_seccomp_available())
1470                 return false;
1471
1472         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1473         return true;
1474 }
1475
1476 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1477         uint32_t negative_action, default_action, action;
1478         int r;
1479
1480         assert(u);
1481         assert(c);
1482
1483         if (!context_has_syscall_filters(c))
1484                 return 0;
1485
1486         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1487                 return 0;
1488
1489         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1490
1491         if (c->syscall_allow_list) {
1492                 default_action = negative_action;
1493                 action = SCMP_ACT_ALLOW;
1494         } else {
1495                 default_action = SCMP_ACT_ALLOW;
1496                 action = negative_action;
1497         }
1498
1499         if (needs_ambient_hack) {
1500                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1501                 if (r < 0)
1502                         return r;
1503         }
1504
1505         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1506 }
1507
1508 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1509 #ifdef SCMP_ACT_LOG
1510         uint32_t default_action, action;
1511 #endif
1512
1513         assert(u);
1514         assert(c);
1515
1516         if (!context_has_syscall_logs(c))
1517                 return 0;
1518
1519 #ifdef SCMP_ACT_LOG
1520         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1521                 return 0;
1522
1523         if (c->syscall_log_allow_list) {
1524                 /* Log nothing but the ones listed */
1525                 default_action = SCMP_ACT_ALLOW;
1526                 action = SCMP_ACT_LOG;
1527         } else {
1528                 /* Log everything but the ones listed */
1529                 default_action = SCMP_ACT_LOG;
1530                 action = SCMP_ACT_ALLOW;
1531         }
1532
1533         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1534 #else
1535         /* old libseccomp */
1536         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1537         return 0;
1538 #endif
1539 }
1540
1541 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1542         assert(u);
1543         assert(c);
1544
1545         if (set_isempty(c->syscall_archs))
1546                 return 0;
1547
1548         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1549                 return 0;
1550
1551         return seccomp_restrict_archs(c->syscall_archs);
1552 }
1553
1554 static int apply_address_families(const Unit* u, const ExecContext *c) {
1555         assert(u);
1556         assert(c);
1557
1558         if (!context_has_address_families(c))
1559                 return 0;
1560
1561         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1562                 return 0;
1563
1564         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1565 }
1566
1567 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1568         assert(u);
1569         assert(c);
1570
1571         if (!c->memory_deny_write_execute)
1572                 return 0;
1573
1574         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1575                 return 0;
1576
1577         return seccomp_memory_deny_write_execute();
1578 }
1579
1580 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1581         assert(u);
1582         assert(c);
1583
1584         if (!c->restrict_realtime)
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1588                 return 0;
1589
1590         return seccomp_restrict_realtime();
1591 }
1592
1593 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1594         assert(u);
1595         assert(c);
1596
1597         if (!c->restrict_suid_sgid)
1598                 return 0;
1599
1600         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1601                 return 0;
1602
1603         return seccomp_restrict_suid_sgid();
1604 }
1605
1606 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1607         assert(u);
1608         assert(c);
1609
1610         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1611          * let's protect even those systems where this is left on in the kernel. */
1612
1613         if (!c->protect_kernel_tunables)
1614                 return 0;
1615
1616         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1617                 return 0;
1618
1619         return seccomp_protect_sysctl();
1620 }
1621
1622 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1623         assert(u);
1624         assert(c);
1625
1626         /* Turn off module syscalls on ProtectKernelModules=yes */
1627
1628         if (!c->protect_kernel_modules)
1629                 return 0;
1630
1631         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1632                 return 0;
1633
1634         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1635 }
1636
1637 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1638         assert(u);
1639         assert(c);
1640
1641         if (!c->protect_kernel_logs)
1642                 return 0;
1643
1644         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1645                 return 0;
1646
1647         return seccomp_protect_syslog();
1648 }
1649
1650 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1651         assert(u);
1652         assert(c);
1653
1654         if (!c->protect_clock)
1655                 return 0;
1656
1657         if (skip_seccomp_unavailable(u, "ProtectClock="))
1658                 return 0;
1659
1660         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1661 }
1662
1663 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1664         assert(u);
1665         assert(c);
1666
1667         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1668
1669         if (!c->private_devices)
1670                 return 0;
1671
1672         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1673                 return 0;
1674
1675         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1676 }
1677
1678 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1679         assert(u);
1680         assert(c);
1681
1682         if (!exec_context_restrict_namespaces_set(c))
1683                 return 0;
1684
1685         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1686                 return 0;
1687
1688         return seccomp_restrict_namespaces(c->restrict_namespaces);
1689 }
1690
1691 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1692         unsigned long personality;
1693         int r;
1694
1695         assert(u);
1696         assert(c);
1697
1698         if (!c->lock_personality)
1699                 return 0;
1700
1701         if (skip_seccomp_unavailable(u, "LockPersonality="))
1702                 return 0;
1703
1704         personality = c->personality;
1705
1706         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1707         if (personality == PERSONALITY_INVALID) {
1708
1709                 r = opinionated_personality(&personality);
1710                 if (r < 0)
1711                         return r;
1712         }
1713
1714         return seccomp_lock_personality(personality);
1715 }
1716
1717 #endif
1718
1719 #if HAVE_LIBBPF
1720 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1721         assert(u);
1722         assert(c);
1723
1724         if (!exec_context_restrict_filesystems_set(c))
1725                 return 0;
1726
1727         if (!u->manager->restrict_fs) {
1728                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1729                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1730                 return 0;
1731         }
1732
1733         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1734 }
1735 #endif
1736
1737 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1738         assert(u);
1739         assert(c);
1740
1741         if (!c->protect_hostname)
1742                 return 0;
1743
1744         if (ns_type_supported(NAMESPACE_UTS)) {
1745                 if (unshare(CLONE_NEWUTS) < 0) {
1746                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1747                                 *ret_exit_status = EXIT_NAMESPACE;
1748                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1749                         }
1750
1751                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1752                 }
1753         } else
1754                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1755
1756 #if HAVE_SECCOMP
1757         int r;
1758
1759         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1760                 return 0;
1761
1762         r = seccomp_protect_hostname();
1763         if (r < 0) {
1764                 *ret_exit_status = EXIT_SECCOMP;
1765                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1766         }
1767 #endif
1768
1769         return 0;
1770 }
1771
1772 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1773         assert(idle_pipe);
1774
1775         idle_pipe[1] = safe_close(idle_pipe[1]);
1776         idle_pipe[2] = safe_close(idle_pipe[2]);
1777
1778         if (idle_pipe[0] >= 0) {
1779                 int r;
1780
1781                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1782
1783                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1784                         ssize_t n;
1785
1786                         /* Signal systemd that we are bored and want to continue. */
1787                         n = write(idle_pipe[3], "x", 1);
1788                         if (n > 0)
1789                                 /* Wait for systemd to react to the signal above. */
1790                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1791                 }
1792
1793                 idle_pipe[0] = safe_close(idle_pipe[0]);
1794
1795         }
1796
1797         idle_pipe[3] = safe_close(idle_pipe[3]);
1798 }
1799
1800 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1801
1802 static int build_environment(
1803                 const Unit *u,
1804                 const ExecContext *c,
1805                 const ExecParameters *p,
1806                 size_t n_fds,
1807                 const char *home,
1808                 const char *username,
1809                 const char *shell,
1810                 dev_t journal_stream_dev,
1811                 ino_t journal_stream_ino,
1812                 char ***ret) {
1813
1814         _cleanup_strv_free_ char **our_env = NULL;
1815         size_t n_env = 0;
1816         char *x;
1817
1818         assert(u);
1819         assert(c);
1820         assert(p);
1821         assert(ret);
1822
1823 #define N_ENV_VARS 17
1824         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1825         if (!our_env)
1826                 return -ENOMEM;
1827
1828         if (n_fds > 0) {
1829                 _cleanup_free_ char *joined = NULL;
1830
1831                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1832                         return -ENOMEM;
1833                 our_env[n_env++] = x;
1834
1835                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1836                         return -ENOMEM;
1837                 our_env[n_env++] = x;
1838
1839                 joined = strv_join(p->fd_names, ":");
1840                 if (!joined)
1841                         return -ENOMEM;
1842
1843                 x = strjoin("LISTEN_FDNAMES=", joined);
1844                 if (!x)
1845                         return -ENOMEM;
1846                 our_env[n_env++] = x;
1847         }
1848
1849         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1850                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1851                         return -ENOMEM;
1852                 our_env[n_env++] = x;
1853
1854                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1855                         return -ENOMEM;
1856                 our_env[n_env++] = x;
1857         }
1858
1859         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1860          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1861          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1862         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1863                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1864                 if (!x)
1865                         return -ENOMEM;
1866                 our_env[n_env++] = x;
1867         }
1868
1869         if (home) {
1870                 x = strjoin("HOME=", home);
1871                 if (!x)
1872                         return -ENOMEM;
1873
1874                 path_simplify(x + 5);
1875                 our_env[n_env++] = x;
1876         }
1877
1878         if (username) {
1879                 x = strjoin("LOGNAME=", username);
1880                 if (!x)
1881                         return -ENOMEM;
1882                 our_env[n_env++] = x;
1883
1884                 x = strjoin("USER=", username);
1885                 if (!x)
1886                         return -ENOMEM;
1887                 our_env[n_env++] = x;
1888         }
1889
1890         if (shell) {
1891                 x = strjoin("SHELL=", shell);
1892                 if (!x)
1893                         return -ENOMEM;
1894
1895                 path_simplify(x + 6);
1896                 our_env[n_env++] = x;
1897         }
1898
1899         if (!sd_id128_is_null(u->invocation_id)) {
1900                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1901                         return -ENOMEM;
1902
1903                 our_env[n_env++] = x;
1904         }
1905
1906         if (exec_context_needs_term(c)) {
1907                 const char *tty_path, *term = NULL;
1908
1909                 tty_path = exec_context_tty_path(c);
1910
1911                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1912                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1913                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1914
1915                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1916                         term = getenv("TERM");
1917
1918                 if (!term)
1919                         term = default_term_for_tty(tty_path);
1920
1921                 x = strjoin("TERM=", term);
1922                 if (!x)
1923                         return -ENOMEM;
1924                 our_env[n_env++] = x;
1925         }
1926
1927         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1928                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1929                         return -ENOMEM;
1930
1931                 our_env[n_env++] = x;
1932         }
1933
1934         if (c->log_namespace) {
1935                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1936                 if (!x)
1937                         return -ENOMEM;
1938
1939                 our_env[n_env++] = x;
1940         }
1941
1942         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1943                 _cleanup_free_ char *joined = NULL;
1944                 const char *n;
1945
1946                 if (!p->prefix[t])
1947                         continue;
1948
1949                 if (c->directories[t].n_items == 0)
1950                         continue;
1951
1952                 n = exec_directory_env_name_to_string(t);
1953                 if (!n)
1954                         continue;
1955
1956                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1957                         _cleanup_free_ char *prefixed = NULL;
1958
1959                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1960                         if (!prefixed)
1961                                 return -ENOMEM;
1962
1963                         if (!strextend_with_separator(&joined, ":", prefixed))
1964                                 return -ENOMEM;
1965                 }
1966
1967                 x = strjoin(n, "=", joined);
1968                 if (!x)
1969                         return -ENOMEM;
1970
1971                 our_env[n_env++] = x;
1972         }
1973
1974         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1975                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1976                 if (!x)
1977                         return -ENOMEM;
1978
1979                 our_env[n_env++] = x;
1980         }
1981
1982         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1983                 return -ENOMEM;
1984
1985         our_env[n_env++] = x;
1986
1987         our_env[n_env++] = NULL;
1988         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1989 #undef N_ENV_VARS
1990
1991         *ret = TAKE_PTR(our_env);
1992
1993         return 0;
1994 }
1995
1996 static int build_pass_environment(const ExecContext *c, char ***ret) {
1997         _cleanup_strv_free_ char **pass_env = NULL;
1998         size_t n_env = 0;
1999
2000         STRV_FOREACH(i, c->pass_environment) {
2001                 _cleanup_free_ char *x = NULL;
2002                 char *v;
2003
2004                 v = getenv(*i);
2005                 if (!v)
2006                         continue;
2007                 x = strjoin(*i, "=", v);
2008                 if (!x)
2009                         return -ENOMEM;
2010
2011                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2012                         return -ENOMEM;
2013
2014                 pass_env[n_env++] = TAKE_PTR(x);
2015                 pass_env[n_env] = NULL;
2016         }
2017
2018         *ret = TAKE_PTR(pass_env);
2019
2020         return 0;
2021 }
2022
2023 bool exec_needs_mount_namespace(
2024                 const ExecContext *context,
2025                 const ExecParameters *params,
2026                 const ExecRuntime *runtime) {
2027
2028         assert(context);
2029
2030         if (context->root_image)
2031                 return true;
2032
2033         if (!strv_isempty(context->read_write_paths) ||
2034             !strv_isempty(context->read_only_paths) ||
2035             !strv_isempty(context->inaccessible_paths) ||
2036             !strv_isempty(context->exec_paths) ||
2037             !strv_isempty(context->no_exec_paths))
2038                 return true;
2039
2040         if (context->n_bind_mounts > 0)
2041                 return true;
2042
2043         if (context->n_temporary_filesystems > 0)
2044                 return true;
2045
2046         if (context->n_mount_images > 0)
2047                 return true;
2048
2049         if (context->n_extension_images > 0)
2050                 return true;
2051
2052         if (!strv_isempty(context->extension_directories))
2053                 return true;
2054
2055         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2056                 return true;
2057
2058         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2059                 return true;
2060
2061         if (context->private_devices ||
2062             context->private_mounts ||
2063             context->protect_system != PROTECT_SYSTEM_NO ||
2064             context->protect_home != PROTECT_HOME_NO ||
2065             context->protect_kernel_tunables ||
2066             context->protect_kernel_modules ||
2067             context->protect_kernel_logs ||
2068             context->protect_control_groups ||
2069             context->protect_proc != PROTECT_PROC_DEFAULT ||
2070             context->proc_subset != PROC_SUBSET_ALL ||
2071             context->private_ipc ||
2072             context->ipc_namespace_path)
2073                 return true;
2074
2075         if (context->root_directory) {
2076                 if (exec_context_get_effective_mount_apivfs(context))
2077                         return true;
2078
2079                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2080                         if (params && !params->prefix[t])
2081                                 continue;
2082
2083                         if (context->directories[t].n_items > 0)
2084                                 return true;
2085                 }
2086         }
2087
2088         if (context->dynamic_user &&
2089             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2090              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2091              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2092                 return true;
2093
2094         if (context->log_namespace)
2095                 return true;
2096
2097         return false;
2098 }
2099
2100 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2101         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2102         _cleanup_close_pair_ int errno_pipe[2] = { -EBADF, -EBADF };
2103         _cleanup_close_ int unshare_ready_fd = -EBADF;
2104         _cleanup_(sigkill_waitp) pid_t pid = 0;
2105         uint64_t c = 1;
2106         ssize_t n;
2107         int r;
2108
2109         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2110          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2111          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2112          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2113          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2114          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2115          * continues execution normally.
2116          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2117          * does not need CAP_SETUID to write the single line mapping to itself. */
2118
2119         /* Can only set up multiple mappings with CAP_SETUID. */
2120         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2121                 r = asprintf(&uid_map,
2122                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2123                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2124                              ouid, ouid, uid, uid);
2125         else
2126                 r = asprintf(&uid_map,
2127                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2128                              ouid, ouid);
2129
2130         if (r < 0)
2131                 return -ENOMEM;
2132
2133         /* Can only set up multiple mappings with CAP_SETGID. */
2134         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2135                 r = asprintf(&gid_map,
2136                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2137                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2138                              ogid, ogid, gid, gid);
2139         else
2140                 r = asprintf(&gid_map,
2141                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2142                              ogid, ogid);
2143
2144         if (r < 0)
2145                 return -ENOMEM;
2146
2147         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2148          * namespace. */
2149         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2150         if (unshare_ready_fd < 0)
2151                 return -errno;
2152
2153         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2154          * failed. */
2155         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2156                 return -errno;
2157
2158         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2159         if (r < 0)
2160                 return r;
2161         if (r == 0) {
2162                 _cleanup_close_ int fd = -EBADF;
2163                 const char *a;
2164                 pid_t ppid;
2165
2166                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2167                  * here, after the parent opened its own user namespace. */
2168
2169                 ppid = getppid();
2170                 errno_pipe[0] = safe_close(errno_pipe[0]);
2171
2172                 /* Wait until the parent unshared the user namespace */
2173                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2174                         r = -errno;
2175                         goto child_fail;
2176                 }
2177
2178                 /* Disable the setgroups() system call in the child user namespace, for good. */
2179                 a = procfs_file_alloca(ppid, "setgroups");
2180                 fd = open(a, O_WRONLY|O_CLOEXEC);
2181                 if (fd < 0) {
2182                         if (errno != ENOENT) {
2183                                 r = -errno;
2184                                 goto child_fail;
2185                         }
2186
2187                         /* If the file is missing the kernel is too old, let's continue anyway. */
2188                 } else {
2189                         if (write(fd, "deny\n", 5) < 0) {
2190                                 r = -errno;
2191                                 goto child_fail;
2192                         }
2193
2194                         fd = safe_close(fd);
2195                 }
2196
2197                 /* First write the GID map */
2198                 a = procfs_file_alloca(ppid, "gid_map");
2199                 fd = open(a, O_WRONLY|O_CLOEXEC);
2200                 if (fd < 0) {
2201                         r = -errno;
2202                         goto child_fail;
2203                 }
2204                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2205                         r = -errno;
2206                         goto child_fail;
2207                 }
2208                 fd = safe_close(fd);
2209
2210                 /* The write the UID map */
2211                 a = procfs_file_alloca(ppid, "uid_map");
2212                 fd = open(a, O_WRONLY|O_CLOEXEC);
2213                 if (fd < 0) {
2214                         r = -errno;
2215                         goto child_fail;
2216                 }
2217                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2218                         r = -errno;
2219                         goto child_fail;
2220                 }
2221
2222                 _exit(EXIT_SUCCESS);
2223
2224         child_fail:
2225                 (void) write(errno_pipe[1], &r, sizeof(r));
2226                 _exit(EXIT_FAILURE);
2227         }
2228
2229         errno_pipe[1] = safe_close(errno_pipe[1]);
2230
2231         if (unshare(CLONE_NEWUSER) < 0)
2232                 return -errno;
2233
2234         /* Let the child know that the namespace is ready now */
2235         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2236                 return -errno;
2237
2238         /* Try to read an error code from the child */
2239         n = read(errno_pipe[0], &r, sizeof(r));
2240         if (n < 0)
2241                 return -errno;
2242         if (n == sizeof(r)) { /* an error code was sent to us */
2243                 if (r < 0)
2244                         return r;
2245                 return -EIO;
2246         }
2247         if (n != 0) /* on success we should have read 0 bytes */
2248                 return -EIO;
2249
2250         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2251         if (r < 0)
2252                 return r;
2253         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2254                 return -EIO;
2255
2256         return 0;
2257 }
2258
2259 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2260         if (!context->dynamic_user)
2261                 return false;
2262
2263         if (type == EXEC_DIRECTORY_CONFIGURATION)
2264                 return false;
2265
2266         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2267                 return false;
2268
2269         return true;
2270 }
2271
2272 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2273         _cleanup_free_ char *src_abs = NULL;
2274         int r;
2275
2276         assert(source);
2277
2278         src_abs = path_join(root, source);
2279         if (!src_abs)
2280                 return -ENOMEM;
2281
2282         STRV_FOREACH(dst, symlinks) {
2283                 _cleanup_free_ char *dst_abs = NULL;
2284
2285                 dst_abs = path_join(root, *dst);
2286                 if (!dst_abs)
2287                         return -ENOMEM;
2288
2289                 r = mkdir_parents_label(dst_abs, 0755);
2290                 if (r < 0)
2291                         return r;
2292
2293                 r = symlink_idempotent(src_abs, dst_abs, true);
2294                 if (r < 0)
2295                         return r;
2296         }
2297
2298         return 0;
2299 }
2300
2301 static int setup_exec_directory(
2302                 const ExecContext *context,
2303                 const ExecParameters *params,
2304                 uid_t uid,
2305                 gid_t gid,
2306                 ExecDirectoryType type,
2307                 bool needs_mount_namespace,
2308                 int *exit_status) {
2309
2310         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2311                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2312                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2313                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2314                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2315                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2316         };
2317         int r;
2318
2319         assert(context);
2320         assert(params);
2321         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2322         assert(exit_status);
2323
2324         if (!params->prefix[type])
2325                 return 0;
2326
2327         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2328                 if (!uid_is_valid(uid))
2329                         uid = 0;
2330                 if (!gid_is_valid(gid))
2331                         gid = 0;
2332         }
2333
2334         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2335                 _cleanup_free_ char *p = NULL, *pp = NULL;
2336
2337                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2338                 if (!p) {
2339                         r = -ENOMEM;
2340                         goto fail;
2341                 }
2342
2343                 r = mkdir_parents_label(p, 0755);
2344                 if (r < 0)
2345                         goto fail;
2346
2347                 if (exec_directory_is_private(context, type)) {
2348                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2349                          * case we want to avoid leaving a directory around fully accessible that is owned by
2350                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2351                          * trick used by container managers to prohibit host users to get access to files of
2352                          * the same UID in containers: we place everything inside a directory that has an
2353                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2354                          * for unprivileged host code. We then use fs namespacing to make this directory
2355                          * permeable for the service itself.
2356                          *
2357                          * Specifically: for a service which wants a special directory "foo/" we first create
2358                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2359                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2360                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2361                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2362                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2363                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2364                          * for the service and making sure it only gets access to the dirs it needs but no
2365                          * others. Tricky? Yes, absolutely, but it works!
2366                          *
2367                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2368                          * to be owned by the service itself.
2369                          *
2370                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2371                          * for sharing files or sockets with other services. */
2372
2373                         pp = path_join(params->prefix[type], "private");
2374                         if (!pp) {
2375                                 r = -ENOMEM;
2376                                 goto fail;
2377                         }
2378
2379                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2380                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2381                         if (r < 0)
2382                                 goto fail;
2383
2384                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2385                                 r = -ENOMEM;
2386                                 goto fail;
2387                         }
2388
2389                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2390                         r = mkdir_parents_label(pp, 0755);
2391                         if (r < 0)
2392                                 goto fail;
2393
2394                         if (is_dir(p, false) > 0 &&
2395                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2396
2397                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2398                                  * it over. Most likely the service has been upgraded from one that didn't use
2399                                  * DynamicUser=1, to one that does. */
2400
2401                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2402                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2403                                          exec_directory_type_to_string(type), p, pp);
2404
2405                                 if (rename(p, pp) < 0) {
2406                                         r = -errno;
2407                                         goto fail;
2408                                 }
2409                         } else {
2410                                 /* Otherwise, create the actual directory for the service */
2411
2412                                 r = mkdir_label(pp, context->directories[type].mode);
2413                                 if (r < 0 && r != -EEXIST)
2414                                         goto fail;
2415                         }
2416
2417                         if (!context->directories[type].items[i].only_create) {
2418                                 /* And link it up from the original place.
2419                                  * Notes
2420                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2421                                  *    the host, and a new one for the child namespace will be created later.
2422                                  * 2) It is not necessary to create this symlink when one of its parent
2423                                  *    directories is specified and already created. E.g.
2424                                  *        StateDirectory=foo foo/bar
2425                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2426                                  *        pp = "/var/lib/private/foo/bar"
2427                                  *        p = "/var/lib/foo/bar"
2428                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2429                                  *    we do not need to create the symlink, but we cannot create the symlink.
2430                                  *    See issue #24783. */
2431                                 r = symlink_idempotent(pp, p, true);
2432                                 if (r < 0)
2433                                         goto fail;
2434                         }
2435
2436                 } else {
2437                         _cleanup_free_ char *target = NULL;
2438
2439                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2440                             readlink_and_make_absolute(p, &target) >= 0) {
2441                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2442
2443                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2444                                  * by DynamicUser=1 (see above)?
2445                                  *
2446                                  * We do this for all directory types except for ConfigurationDirectory=,
2447                                  * since they all support the private/ symlink logic at least in some
2448                                  * configurations, see above. */
2449
2450                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2451                                 if (r < 0)
2452                                         goto fail;
2453
2454                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2455                                 if (!q) {
2456                                         r = -ENOMEM;
2457                                         goto fail;
2458                                 }
2459
2460                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2461                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2462                                 if (r < 0)
2463                                         goto fail;
2464
2465                                 if (path_equal(q_resolved, target_resolved)) {
2466
2467                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2468                                          * but is no longer. Let's move the directory back up. */
2469
2470                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2471                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2472                                                  exec_directory_type_to_string(type), q, p);
2473
2474                                         if (unlink(p) < 0) {
2475                                                 r = -errno;
2476                                                 goto fail;
2477                                         }
2478
2479                                         if (rename(q, p) < 0) {
2480                                                 r = -errno;
2481                                                 goto fail;
2482                                         }
2483                                 }
2484                         }
2485
2486                         r = mkdir_label(p, context->directories[type].mode);
2487                         if (r < 0) {
2488                                 if (r != -EEXIST)
2489                                         goto fail;
2490
2491                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2492                                         struct stat st;
2493
2494                                         /* Don't change the owner/access mode of the configuration directory,
2495                                          * as in the common case it is not written to by a service, and shall
2496                                          * not be writable. */
2497
2498                                         if (stat(p, &st) < 0) {
2499                                                 r = -errno;
2500                                                 goto fail;
2501                                         }
2502
2503                                         /* Still complain if the access mode doesn't match */
2504                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2505                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2506                                                             "(File system: %o %sMode: %o)",
2507                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2508                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2509
2510                                         continue;
2511                                 }
2512                         }
2513                 }
2514
2515                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2516                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2517                  * current UID/GID ownership.) */
2518                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2519                 if (r < 0)
2520                         goto fail;
2521
2522                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2523                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2524                  * assignments to exist. */
2525                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2526                 if (r < 0)
2527                         goto fail;
2528         }
2529
2530         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2531          * they are set up later, to allow configuring empty var/run/etc. */
2532         if (!needs_mount_namespace)
2533                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2534                         r = create_many_symlinks(params->prefix[type],
2535                                                  context->directories[type].items[i].path,
2536                                                  context->directories[type].items[i].symlinks);
2537                         if (r < 0)
2538                                 goto fail;
2539                 }
2540
2541         return 0;
2542
2543 fail:
2544         *exit_status = exit_status_table[type];
2545         return r;
2546 }
2547
2548 static int write_credential(
2549                 int dfd,
2550                 const char *id,
2551                 const void *data,
2552                 size_t size,
2553                 uid_t uid,
2554                 bool ownership_ok) {
2555
2556         _cleanup_(unlink_and_freep) char *tmp = NULL;
2557         _cleanup_close_ int fd = -EBADF;
2558         int r;
2559
2560         r = tempfn_random_child("", "cred", &tmp);
2561         if (r < 0)
2562                 return r;
2563
2564         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2565         if (fd < 0) {
2566                 tmp = mfree(tmp);
2567                 return -errno;
2568         }
2569
2570         r = loop_write(fd, data, size, /* do_poll = */ false);
2571         if (r < 0)
2572                 return r;
2573
2574         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2575                 return -errno;
2576
2577         if (uid_is_valid(uid) && uid != getuid()) {
2578                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2579                 if (r < 0) {
2580                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2581                                 return r;
2582
2583                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2584                                             * to express: that the user gets read access and nothing
2585                                             * else. But if the backing fs can't support that (e.g. ramfs)
2586                                             * then we can use file ownership instead. But that's only safe if
2587                                             * we can then re-mount the whole thing read-only, so that the
2588                                             * user can no longer chmod() the file to gain write access. */
2589                                 return r;
2590
2591                         if (fchown(fd, uid, GID_INVALID) < 0)
2592                                 return -errno;
2593                 }
2594         }
2595
2596         if (renameat(dfd, tmp, dfd, id) < 0)
2597                 return -errno;
2598
2599         tmp = mfree(tmp);
2600         return 0;
2601 }
2602
2603 static char **credential_search_path(
2604                 const ExecParameters *params,
2605                 bool encrypted) {
2606
2607         _cleanup_strv_free_ char **l = NULL;
2608
2609         assert(params);
2610
2611         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2612          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2613          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2614
2615         if (encrypted) {
2616                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2617                         return NULL;
2618
2619                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2620                         return NULL;
2621         }
2622
2623         if (params->received_credentials_directory)
2624                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2625                         return NULL;
2626
2627         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2628                 return NULL;
2629
2630         if (DEBUG_LOGGING) {
2631                 _cleanup_free_ char *t = strv_join(l, ":");
2632
2633                 log_debug("Credential search path is: %s", t);
2634         }
2635
2636         return TAKE_PTR(l);
2637 }
2638
2639 static int load_credential(
2640                 const ExecContext *context,
2641                 const ExecParameters *params,
2642                 const char *id,
2643                 const char *path,
2644                 bool encrypted,
2645                 const char *unit,
2646                 int read_dfd,
2647                 int write_dfd,
2648                 uid_t uid,
2649                 bool ownership_ok,
2650                 uint64_t *left) {
2651
2652         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2653         _cleanup_strv_free_ char **search_path = NULL;
2654         _cleanup_(erase_and_freep) char *data = NULL;
2655         _cleanup_free_ char *bindname = NULL;
2656         const char *source = NULL;
2657         bool missing_ok = true;
2658         size_t size, add, maxsz;
2659         int r;
2660
2661         assert(context);
2662         assert(params);
2663         assert(id);
2664         assert(path);
2665         assert(unit);
2666         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2667         assert(write_dfd >= 0);
2668         assert(left);
2669
2670         if (read_dfd >= 0) {
2671                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2672                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2673                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2674                  * open it. */
2675
2676                 if (!filename_is_valid(path)) /* safety check */
2677                         return -EINVAL;
2678
2679                 missing_ok = true;
2680                 source = path;
2681
2682         } else if (path_is_absolute(path)) {
2683                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2684                  * sockets */
2685
2686                 if (!path_is_valid(path)) /* safety check */
2687                         return -EINVAL;
2688
2689                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2690
2691                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2692                  * via the source socket address in case we read off an AF_UNIX socket. */
2693                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2694                         return -ENOMEM;
2695
2696                 missing_ok = false;
2697                 source = path;
2698
2699         } else if (credential_name_valid(path)) {
2700                 /* If this is a relative path, take it as credential name relative to the credentials
2701                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2702                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2703
2704                 search_path = credential_search_path(params, encrypted);
2705                 if (!search_path)
2706                         return -ENOMEM;
2707
2708                 missing_ok = true;
2709         } else
2710                 source = NULL;
2711
2712         if (encrypted)
2713                 flags |= READ_FULL_FILE_UNBASE64;
2714
2715         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2716
2717         if (search_path) {
2718                 STRV_FOREACH(d, search_path) {
2719                         _cleanup_free_ char *j = NULL;
2720
2721                         j = path_join(*d, path);
2722                         if (!j)
2723                                 return -ENOMEM;
2724
2725                         r = read_full_file_full(
2726                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2727                                         UINT64_MAX,
2728                                         maxsz,
2729                                         flags,
2730                                         NULL,
2731                                         &data, &size);
2732                         if (r != -ENOENT)
2733                                 break;
2734                 }
2735         } else if (source)
2736                 r = read_full_file_full(
2737                                 read_dfd, source,
2738                                 UINT64_MAX,
2739                                 maxsz,
2740                                 flags,
2741                                 bindname,
2742                                 &data, &size);
2743         else
2744                 r = -ENOENT;
2745
2746         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2747                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2748                  * will get clear errors if we don't pass such a missing credential on as they
2749                  * themselves will get ENOENT when trying to read them, which should not be much
2750                  * worse than when we handle the error here and make it fatal.
2751                  *
2752                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2753                  * we are fine, too. */
2754                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2755                 return 0;
2756         }
2757         if (r < 0)
2758                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2759
2760         if (encrypted) {
2761                 _cleanup_free_ void *plaintext = NULL;
2762                 size_t plaintext_size = 0;
2763
2764                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2765                 if (r < 0)
2766                         return r;
2767
2768                 free_and_replace(data, plaintext);
2769                 size = plaintext_size;
2770         }
2771
2772         add = strlen(id) + size;
2773         if (add > *left)
2774                 return -E2BIG;
2775
2776         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2777         if (r < 0)
2778                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2779
2780         *left -= add;
2781         return 0;
2782 }
2783
2784 struct load_cred_args {
2785         const ExecContext *context;
2786         const ExecParameters *params;
2787         bool encrypted;
2788         const char *unit;
2789         int dfd;
2790         uid_t uid;
2791         bool ownership_ok;
2792         uint64_t *left;
2793 };
2794
2795 static int load_cred_recurse_dir_cb(
2796                 RecurseDirEvent event,
2797                 const char *path,
2798                 int dir_fd,
2799                 int inode_fd,
2800                 const struct dirent *de,
2801                 const struct statx *sx,
2802                 void *userdata) {
2803
2804         struct load_cred_args *args = ASSERT_PTR(userdata);
2805         _cleanup_free_ char *sub_id = NULL;
2806         int r;
2807
2808         if (event != RECURSE_DIR_ENTRY)
2809                 return RECURSE_DIR_CONTINUE;
2810
2811         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2812                 return RECURSE_DIR_CONTINUE;
2813
2814         sub_id = strreplace(path, "/", "_");
2815         if (!sub_id)
2816                 return -ENOMEM;
2817
2818         if (!credential_name_valid(sub_id))
2819                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2820
2821         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2822                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2823                 return RECURSE_DIR_CONTINUE;
2824         }
2825         if (errno != ENOENT)
2826                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2827
2828         r = load_credential(
2829                         args->context,
2830                         args->params,
2831                         sub_id,
2832                         de->d_name,
2833                         args->encrypted,
2834                         args->unit,
2835                         dir_fd,
2836                         args->dfd,
2837                         args->uid,
2838                         args->ownership_ok,
2839                         args->left);
2840         if (r < 0)
2841                 return r;
2842
2843         return RECURSE_DIR_CONTINUE;
2844 }
2845
2846 static int acquire_credentials(
2847                 const ExecContext *context,
2848                 const ExecParameters *params,
2849                 const char *unit,
2850                 const char *p,
2851                 uid_t uid,
2852                 bool ownership_ok) {
2853
2854         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2855         _cleanup_close_ int dfd = -EBADF;
2856         ExecLoadCredential *lc;
2857         ExecSetCredential *sc;
2858         int r;
2859
2860         assert(context);
2861         assert(p);
2862
2863         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2864         if (dfd < 0)
2865                 return -errno;
2866
2867         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2868         HASHMAP_FOREACH(lc, context->load_credentials) {
2869                 _cleanup_close_ int sub_fd = -EBADF;
2870
2871                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2872                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2873                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
2874                  * propagate a credential passed to us from further up. */
2875
2876                 if (path_is_absolute(lc->path)) {
2877                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2878                         if (sub_fd < 0 && !IN_SET(errno,
2879                                                   ENOTDIR,  /* Not a directory */
2880                                                   ENOENT))  /* Doesn't exist? */
2881                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2882                 }
2883
2884                 if (sub_fd < 0)
2885                         /* Regular file (incl. a credential passed in from higher up) */
2886                         r = load_credential(
2887                                         context,
2888                                         params,
2889                                         lc->id,
2890                                         lc->path,
2891                                         lc->encrypted,
2892                                         unit,
2893                                         AT_FDCWD,
2894                                         dfd,
2895                                         uid,
2896                                         ownership_ok,
2897                                         &left);
2898                 else
2899                         /* Directory */
2900                         r = recurse_dir(
2901                                         sub_fd,
2902                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2903                                         /* statx_mask= */ 0,
2904                                         /* n_depth_max= */ UINT_MAX,
2905                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2906                                         load_cred_recurse_dir_cb,
2907                                         &(struct load_cred_args) {
2908                                                 .context = context,
2909                                                 .params = params,
2910                                                 .encrypted = lc->encrypted,
2911                                                 .unit = unit,
2912                                                 .dfd = dfd,
2913                                                 .uid = uid,
2914                                                 .ownership_ok = ownership_ok,
2915                                                 .left = &left,
2916                                         });
2917                 if (r < 0)
2918                         return r;
2919         }
2920
2921         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2922          * them, so that they can act as a "default" if the same credential is specified multiple times. */
2923         HASHMAP_FOREACH(sc, context->set_credentials) {
2924                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2925                 const char *data;
2926                 size_t size, add;
2927
2928                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2929                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2930                  * slow and involved, hence it's nice to be able to skip that if the credential already
2931                  * exists anyway. */
2932                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2933                         continue;
2934                 if (errno != ENOENT)
2935                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2936
2937                 if (sc->encrypted) {
2938                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2939                         if (r < 0)
2940                                 return r;
2941
2942                         data = plaintext;
2943                 } else {
2944                         data = sc->data;
2945                         size = sc->size;
2946                 }
2947
2948                 add = strlen(sc->id) + size;
2949                 if (add > left)
2950                         return -E2BIG;
2951
2952                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2953                 if (r < 0)
2954                         return r;
2955
2956                 left -= add;
2957         }
2958
2959         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2960                 return -errno;
2961
2962         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2963          * accessible */
2964
2965         if (uid_is_valid(uid) && uid != getuid()) {
2966                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2967                 if (r < 0) {
2968                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2969                                 return r;
2970
2971                         if (!ownership_ok)
2972                                 return r;
2973
2974                         if (fchown(dfd, uid, GID_INVALID) < 0)
2975                                 return -errno;
2976                 }
2977         }
2978
2979         return 0;
2980 }
2981
2982 static int setup_credentials_internal(
2983                 const ExecContext *context,
2984                 const ExecParameters *params,
2985                 const char *unit,
2986                 const char *final,        /* This is where the credential store shall eventually end up at */
2987                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2988                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2989                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2990                 uid_t uid) {
2991
2992         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2993                                    * if we mounted something; false if we definitely can't mount anything */
2994         bool final_mounted;
2995         const char *where;
2996
2997         assert(context);
2998         assert(final);
2999         assert(workspace);
3000
3001         if (reuse_workspace) {
3002                 r = path_is_mount_point(workspace, NULL, 0);
3003                 if (r < 0)
3004                         return r;
3005                 if (r > 0)
3006                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3007                 else
3008                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3009         } else
3010                 workspace_mounted = -1; /* ditto */
3011
3012         r = path_is_mount_point(final, NULL, 0);
3013         if (r < 0)
3014                 return r;
3015         if (r > 0) {
3016                 /* If the final place already has something mounted, we use that. If the workspace also has
3017                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3018                  * different). */
3019                 final_mounted = true;
3020
3021                 if (workspace_mounted < 0) {
3022                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3023                          * the final version to the workspace, and make it writable, so that we can make
3024                          * changes */
3025
3026                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3027                         if (r < 0)
3028                                 return r;
3029
3030                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3031                         if (r < 0)
3032                                 return r;
3033
3034                         workspace_mounted = true;
3035                 }
3036         } else
3037                 final_mounted = false;
3038
3039         if (workspace_mounted < 0) {
3040                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3041                 for (int try = 0;; try++) {
3042
3043                         if (try == 0) {
3044                                 /* Try "ramfs" first, since it's not swap backed */
3045                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3046                                 if (r >= 0) {
3047                                         workspace_mounted = true;
3048                                         break;
3049                                 }
3050
3051                         } else if (try == 1) {
3052                                 _cleanup_free_ char *opts = NULL;
3053
3054                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3055                                         return -ENOMEM;
3056
3057                                 /* Fall back to "tmpfs" otherwise */
3058                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3059                                 if (r >= 0) {
3060                                         workspace_mounted = true;
3061                                         break;
3062                                 }
3063
3064                         } else {
3065                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3066                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3067                                 if (r < 0) {
3068                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3069                                                 return r;
3070
3071                                         if (must_mount) /* If we it's not OK to use the plain directory
3072                                                          * fallback, propagate all errors too */
3073                                                 return r;
3074
3075                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3076                                          * proceed for compat with container envs, and just use the final dir
3077                                          * as is. */
3078
3079                                         workspace_mounted = false;
3080                                         break;
3081                                 }
3082
3083                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3084                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3085                                 if (r < 0)
3086                                         return r;
3087
3088                                 workspace_mounted = true;
3089                                 break;
3090                         }
3091                 }
3092         }
3093
3094         assert(!must_mount || workspace_mounted > 0);
3095         where = workspace_mounted ? workspace : final;
3096
3097         (void) label_fix_full(AT_FDCWD, where, final, 0);
3098
3099         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3100         if (r < 0)
3101                 return r;
3102
3103         if (workspace_mounted) {
3104                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3105                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3106                 if (r < 0)
3107                         return r;
3108
3109                 /* And mount it to the final place, read-only */
3110                 if (final_mounted)
3111                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3112                 else
3113                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3114                 if (r < 0)
3115                         return r;
3116         } else {
3117                 _cleanup_free_ char *parent = NULL;
3118
3119                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3120                  * open access to the top-level credential directory and the per-service directory now */
3121
3122                 r = path_extract_directory(final, &parent);
3123                 if (r < 0)
3124                         return r;
3125                 if (chmod(parent, 0755) < 0)
3126                         return -errno;
3127         }
3128
3129         return 0;
3130 }
3131
3132 static int setup_credentials(
3133                 const ExecContext *context,
3134                 const ExecParameters *params,
3135                 const char *unit,
3136                 uid_t uid) {
3137
3138         _cleanup_free_ char *p = NULL, *q = NULL;
3139         int r;
3140
3141         assert(context);
3142         assert(params);
3143
3144         if (!exec_context_has_credentials(context))
3145                 return 0;
3146
3147         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3148                 return -EINVAL;
3149
3150         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3151          * and the subdir we mount over with a read-only file system readable by the service's user */
3152         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3153         if (!q)
3154                 return -ENOMEM;
3155
3156         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3157         if (r < 0 && r != -EEXIST)
3158                 return r;
3159
3160         p = path_join(q, unit);
3161         if (!p)
3162                 return -ENOMEM;
3163
3164         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3165         if (r < 0 && r != -EEXIST)
3166                 return r;
3167
3168         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3169         if (r < 0) {
3170                 _cleanup_free_ char *t = NULL, *u = NULL;
3171
3172                 /* If this is not a privilege or support issue then propagate the error */
3173                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3174                         return r;
3175
3176                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3177                  * it into place, so that users can't access half-initialized credential stores. */
3178                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3179                 if (!t)
3180                         return -ENOMEM;
3181
3182                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3183                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3184                  * after it is fully set up */
3185                 u = path_join(t, unit);
3186                 if (!u)
3187                         return -ENOMEM;
3188
3189                 FOREACH_STRING(i, t, u) {
3190                         r = mkdir_label(i, 0700);
3191                         if (r < 0 && r != -EEXIST)
3192                                 return r;
3193                 }
3194
3195                 r = setup_credentials_internal(
3196                                 context,
3197                                 params,
3198                                 unit,
3199                                 p,       /* final mount point */
3200                                 u,       /* temporary workspace to overmount */
3201                                 true,    /* reuse the workspace if it is already a mount */
3202                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3203                                 uid);
3204
3205                 (void) rmdir(u); /* remove the workspace again if we can. */
3206
3207                 if (r < 0)
3208                         return r;
3209
3210         } else if (r == 0) {
3211
3212                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3213                  * we can use the same directory for all cases, after turning off propagation. Question
3214                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3215                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3216                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3217                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3218                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3219                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3220                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3221                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3222                  * propagation on the former, and then overmount the latter.
3223                  *
3224                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3225                  * for this purpose, but there are few other candidates that work equally well for us, and
3226                  * given that the we do this in a privately namespaced short-lived single-threaded process
3227                  * that no one else sees this should be OK to do. */
3228
3229                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3230                 if (r < 0)
3231                         goto child_fail;
3232
3233                 r = setup_credentials_internal(
3234                                 context,
3235                                 params,
3236                                 unit,
3237                                 p,           /* final mount point */
3238                                 "/dev/shm",  /* temporary workspace to overmount */
3239                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3240                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3241                                 uid);
3242                 if (r < 0)
3243                         goto child_fail;
3244
3245                 _exit(EXIT_SUCCESS);
3246
3247         child_fail:
3248                 _exit(EXIT_FAILURE);
3249         }
3250
3251         return 0;
3252 }
3253
3254 #if ENABLE_SMACK
3255 static int setup_smack(
3256                 const Manager *manager,
3257                 const ExecContext *context,
3258                 int executable_fd) {
3259         int r;
3260
3261         assert(context);
3262         assert(executable_fd >= 0);
3263
3264         if (context->smack_process_label) {
3265                 r = mac_smack_apply_pid(0, context->smack_process_label);
3266                 if (r < 0)
3267                         return r;
3268         } else if (manager->default_smack_process_label) {
3269                 _cleanup_free_ char *exec_label = NULL;
3270
3271                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3272                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3273                         return r;
3274
3275                 r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
3276                 if (r < 0)
3277                         return r;
3278         }
3279
3280         return 0;
3281 }
3282 #endif
3283
3284 static int compile_bind_mounts(
3285                 const ExecContext *context,
3286                 const ExecParameters *params,
3287                 BindMount **ret_bind_mounts,
3288                 size_t *ret_n_bind_mounts,
3289                 char ***ret_empty_directories) {
3290
3291         _cleanup_strv_free_ char **empty_directories = NULL;
3292         BindMount *bind_mounts;
3293         size_t n, h = 0;
3294         int r;
3295
3296         assert(context);
3297         assert(params);
3298         assert(ret_bind_mounts);
3299         assert(ret_n_bind_mounts);
3300         assert(ret_empty_directories);
3301
3302         n = context->n_bind_mounts;
3303         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3304                 if (!params->prefix[t])
3305                         continue;
3306
3307                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3308                         n += !context->directories[t].items[i].only_create;
3309         }
3310
3311         if (n <= 0) {
3312                 *ret_bind_mounts = NULL;
3313                 *ret_n_bind_mounts = 0;
3314                 *ret_empty_directories = NULL;
3315                 return 0;
3316         }
3317
3318         bind_mounts = new(BindMount, n);
3319         if (!bind_mounts)
3320                 return -ENOMEM;
3321
3322         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3323                 BindMount *item = context->bind_mounts + i;
3324                 char *s, *d;
3325
3326                 s = strdup(item->source);
3327                 if (!s) {
3328                         r = -ENOMEM;
3329                         goto finish;
3330                 }
3331
3332                 d = strdup(item->destination);
3333                 if (!d) {
3334                         free(s);
3335                         r = -ENOMEM;
3336                         goto finish;
3337                 }
3338
3339                 bind_mounts[h++] = (BindMount) {
3340                         .source = s,
3341                         .destination = d,
3342                         .read_only = item->read_only,
3343                         .recursive = item->recursive,
3344                         .ignore_enoent = item->ignore_enoent,
3345                 };
3346         }
3347
3348         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3349                 if (!params->prefix[t])
3350                         continue;
3351
3352                 if (context->directories[t].n_items == 0)
3353                         continue;
3354
3355                 if (exec_directory_is_private(context, t) &&
3356                     !exec_context_with_rootfs(context)) {
3357                         char *private_root;
3358
3359                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3360                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3361                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3362
3363                         private_root = path_join(params->prefix[t], "private");
3364                         if (!private_root) {
3365                                 r = -ENOMEM;
3366                                 goto finish;
3367                         }
3368
3369                         r = strv_consume(&empty_directories, private_root);
3370                         if (r < 0)
3371                                 goto finish;
3372                 }
3373
3374                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3375                         char *s, *d;
3376
3377                         /* When one of the parent directories is in the list, we cannot create the symlink
3378                          * for the child directory. See also the comments in setup_exec_directory(). */
3379                         if (context->directories[t].items[i].only_create)
3380                                 continue;
3381
3382                         if (exec_directory_is_private(context, t))
3383                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3384                         else
3385                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3386                         if (!s) {
3387                                 r = -ENOMEM;
3388                                 goto finish;
3389                         }
3390
3391                         if (exec_directory_is_private(context, t) &&
3392                             exec_context_with_rootfs(context))
3393                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3394                                  * directory is not created on the root directory. So, let's bind-mount the directory
3395                                  * on the 'non-private' place. */
3396                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3397                         else
3398                                 d = strdup(s);
3399                         if (!d) {
3400                                 free(s);
3401                                 r = -ENOMEM;
3402                                 goto finish;
3403                         }
3404
3405                         bind_mounts[h++] = (BindMount) {
3406                                 .source = s,
3407                                 .destination = d,
3408                                 .read_only = false,
3409                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3410                                 .recursive = true,
3411                                 .ignore_enoent = false,
3412                         };
3413                 }
3414         }
3415
3416         assert(h == n);
3417
3418         *ret_bind_mounts = bind_mounts;
3419         *ret_n_bind_mounts = n;
3420         *ret_empty_directories = TAKE_PTR(empty_directories);
3421
3422         return (int) n;
3423
3424 finish:
3425         bind_mount_free_many(bind_mounts, h);
3426         return r;
3427 }
3428
3429 /* ret_symlinks will contain a list of pairs src:dest that describes
3430  * the symlinks to create later on. For example, the symlinks needed
3431  * to safely give private directories to DynamicUser=1 users. */
3432 static int compile_symlinks(
3433                 const ExecContext *context,
3434                 const ExecParameters *params,
3435                 char ***ret_symlinks) {
3436
3437         _cleanup_strv_free_ char **symlinks = NULL;
3438         int r;
3439
3440         assert(context);
3441         assert(params);
3442         assert(ret_symlinks);
3443
3444         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3445                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3446                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3447
3448                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3449                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3450
3451                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3452                                 dst_abs = path_join(params->prefix[dt], *symlink);
3453                                 if (!src_abs || !dst_abs)
3454                                         return -ENOMEM;
3455
3456                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3457                                 if (r < 0)
3458                                         return r;
3459                         }
3460
3461                         if (!exec_directory_is_private(context, dt) ||
3462                             exec_context_with_rootfs(context) ||
3463                             context->directories[dt].items[i].only_create)
3464                                 continue;
3465
3466                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3467                         if (!private_path)
3468                                 return -ENOMEM;
3469
3470                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3471                         if (!path)
3472                                 return -ENOMEM;
3473
3474                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3475                         if (r < 0)
3476                                 return r;
3477                 }
3478         }
3479
3480         *ret_symlinks = TAKE_PTR(symlinks);
3481
3482         return 0;
3483 }
3484
3485 static bool insist_on_sandboxing(
3486                 const ExecContext *context,
3487                 const char *root_dir,
3488                 const char *root_image,
3489                 const BindMount *bind_mounts,
3490                 size_t n_bind_mounts) {
3491
3492         assert(context);
3493         assert(n_bind_mounts == 0 || bind_mounts);
3494
3495         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3496          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3497          * rearrange stuff in a way we cannot ignore gracefully. */
3498
3499         if (context->n_temporary_filesystems > 0)
3500                 return true;
3501
3502         if (root_dir || root_image)
3503                 return true;
3504
3505         if (context->n_mount_images > 0)
3506                 return true;
3507
3508         if (context->dynamic_user)
3509                 return true;
3510
3511         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3512                 return true;
3513
3514         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3515          * essential. */
3516         for (size_t i = 0; i < n_bind_mounts; i++)
3517                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3518                         return true;
3519
3520         if (context->log_namespace)
3521                 return true;
3522
3523         return false;
3524 }
3525
3526 static int apply_mount_namespace(
3527                 const Unit *u,
3528                 ExecCommandFlags command_flags,
3529                 const ExecContext *context,
3530                 const ExecParameters *params,
3531                 const ExecRuntime *runtime,
3532                 char **error_path) {
3533
3534         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3535         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3536         const char *root_dir = NULL, *root_image = NULL;
3537         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3538                         *extension_dir = NULL;
3539         NamespaceInfo ns_info;
3540         bool needs_sandboxing;
3541         BindMount *bind_mounts = NULL;
3542         size_t n_bind_mounts = 0;
3543         int r;
3544
3545         assert(context);
3546
3547         if (params->flags & EXEC_APPLY_CHROOT) {
3548                 root_image = context->root_image;
3549
3550                 if (!root_image)
3551                         root_dir = context->root_directory;
3552         }
3553
3554         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3555         if (r < 0)
3556                 return r;
3557
3558         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3559         r = compile_symlinks(context, params, &symlinks);
3560         if (r < 0)
3561                 goto finalize;
3562
3563         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3564         if (needs_sandboxing) {
3565                 /* The runtime struct only contains the parent of the private /tmp,
3566                  * which is non-accessible to world users. Inside of it there's a /tmp
3567                  * that is sticky, and that's the one we want to use here.
3568                  * This does not apply when we are using /run/systemd/empty as fallback. */
3569
3570                 if (context->private_tmp && runtime) {
3571                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3572                                 tmp_dir = runtime->tmp_dir;
3573                         else if (runtime->tmp_dir)
3574                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3575
3576                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3577                                 var_tmp_dir = runtime->var_tmp_dir;
3578                         else if (runtime->var_tmp_dir)
3579                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3580                 }
3581
3582                 ns_info = (NamespaceInfo) {
3583                         .ignore_protect_paths = false,
3584                         .private_dev = context->private_devices,
3585                         .protect_control_groups = context->protect_control_groups,
3586                         .protect_kernel_tunables = context->protect_kernel_tunables,
3587                         .protect_kernel_modules = context->protect_kernel_modules,
3588                         .protect_kernel_logs = context->protect_kernel_logs,
3589                         .protect_hostname = context->protect_hostname,
3590                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3591                         .private_mounts = context->private_mounts,
3592                         .protect_home = context->protect_home,
3593                         .protect_system = context->protect_system,
3594                         .protect_proc = context->protect_proc,
3595                         .proc_subset = context->proc_subset,
3596                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3597                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3598                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3599                 };
3600         } else if (!context->dynamic_user && root_dir)
3601                 /*
3602                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3603                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3604                  * fail if we are enable to apply the sandbox inside the mount namespace.
3605                  */
3606                 ns_info = (NamespaceInfo) {
3607                         .ignore_protect_paths = true,
3608                 };
3609         else
3610                 ns_info = (NamespaceInfo) {};
3611
3612         if (context->mount_flags == MS_SHARED)
3613                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3614
3615         if (exec_context_has_credentials(context) &&
3616             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3617             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3618                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3619                 if (!creds_path) {
3620                         r = -ENOMEM;
3621                         goto finalize;
3622                 }
3623         }
3624
3625         if (MANAGER_IS_SYSTEM(u->manager)) {
3626                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3627                 if (!propagate_dir) {
3628                         r = -ENOMEM;
3629                         goto finalize;
3630                 }
3631
3632                 incoming_dir = strdup("/run/systemd/incoming");
3633                 if (!incoming_dir) {
3634                         r = -ENOMEM;
3635                         goto finalize;
3636                 }
3637
3638                 extension_dir = strdup("/run/systemd/unit-extensions");
3639                 if (!extension_dir) {
3640                         r = -ENOMEM;
3641                         goto finalize;
3642                 }
3643         } else
3644                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3645                         r = -ENOMEM;
3646                         goto finalize;
3647                 }
3648
3649         r = setup_namespace(root_dir, root_image, context->root_image_options,
3650                             &ns_info, context->read_write_paths,
3651                             needs_sandboxing ? context->read_only_paths : NULL,
3652                             needs_sandboxing ? context->inaccessible_paths : NULL,
3653                             needs_sandboxing ? context->exec_paths : NULL,
3654                             needs_sandboxing ? context->no_exec_paths : NULL,
3655                             empty_directories,
3656                             symlinks,
3657                             bind_mounts,
3658                             n_bind_mounts,
3659                             context->temporary_filesystems,
3660                             context->n_temporary_filesystems,
3661                             context->mount_images,
3662                             context->n_mount_images,
3663                             tmp_dir,
3664                             var_tmp_dir,
3665                             creds_path,
3666                             context->log_namespace,
3667                             context->mount_flags,
3668                             context->root_hash, context->root_hash_size, context->root_hash_path,
3669                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3670                             context->root_verity,
3671                             context->extension_images,
3672                             context->n_extension_images,
3673                             context->extension_directories,
3674                             propagate_dir,
3675                             incoming_dir,
3676                             extension_dir,
3677                             root_dir || root_image ? params->notify_socket : NULL,
3678                             error_path);
3679
3680         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3681          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3682          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3683          * completely different execution environment. */
3684         if (r == -ENOANO) {
3685                 if (insist_on_sandboxing(
3686                                     context,
3687                                     root_dir, root_image,
3688                                     bind_mounts,
3689                                     n_bind_mounts)) {
3690                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3691                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3692                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3693
3694                         r = -EOPNOTSUPP;
3695                 } else {
3696                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3697                         r = 0;
3698                 }
3699         }
3700
3701 finalize:
3702         bind_mount_free_many(bind_mounts, n_bind_mounts);
3703         return r;
3704 }
3705
3706 static int apply_working_directory(
3707                 const ExecContext *context,
3708                 const ExecParameters *params,
3709                 const char *home,
3710                 int *exit_status) {
3711
3712         const char *d, *wd;
3713
3714         assert(context);
3715         assert(exit_status);
3716
3717         if (context->working_directory_home) {
3718
3719                 if (!home) {
3720                         *exit_status = EXIT_CHDIR;
3721                         return -ENXIO;
3722                 }
3723
3724                 wd = home;
3725
3726         } else
3727                 wd = empty_to_root(context->working_directory);
3728
3729         if (params->flags & EXEC_APPLY_CHROOT)
3730                 d = wd;
3731         else
3732                 d = prefix_roota(context->root_directory, wd);
3733
3734         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3735                 *exit_status = EXIT_CHDIR;
3736                 return -errno;
3737         }
3738
3739         return 0;
3740 }
3741
3742 static int apply_root_directory(
3743                 const ExecContext *context,
3744                 const ExecParameters *params,
3745                 const bool needs_mount_ns,
3746                 int *exit_status) {
3747
3748         assert(context);
3749         assert(exit_status);
3750
3751         if (params->flags & EXEC_APPLY_CHROOT)
3752                 if (!needs_mount_ns && context->root_directory)
3753                         if (chroot(context->root_directory) < 0) {
3754                                 *exit_status = EXIT_CHROOT;
3755                                 return -errno;
3756                         }
3757
3758         return 0;
3759 }
3760
3761 static int setup_keyring(
3762                 const Unit *u,
3763                 const ExecContext *context,
3764                 const ExecParameters *p,
3765                 uid_t uid, gid_t gid) {
3766
3767         key_serial_t keyring;
3768         int r = 0;
3769         uid_t saved_uid;
3770         gid_t saved_gid;
3771
3772         assert(u);
3773         assert(context);
3774         assert(p);
3775
3776         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3777          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3778          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3779          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3780          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3781          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3782
3783         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3784                 return 0;
3785
3786         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3787          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3788          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3789          * & group is just as nasty as acquiring a reference to the user keyring. */
3790
3791         saved_uid = getuid();
3792         saved_gid = getgid();
3793
3794         if (gid_is_valid(gid) && gid != saved_gid) {
3795                 if (setregid(gid, -1) < 0)
3796                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3797         }
3798
3799         if (uid_is_valid(uid) && uid != saved_uid) {
3800                 if (setreuid(uid, -1) < 0) {
3801                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3802                         goto out;
3803                 }
3804         }
3805
3806         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3807         if (keyring == -1) {
3808                 if (errno == ENOSYS)
3809                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3810                 else if (ERRNO_IS_PRIVILEGE(errno))
3811                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3812                 else if (errno == EDQUOT)
3813                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3814                 else
3815                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3816
3817                 goto out;
3818         }
3819
3820         /* When requested link the user keyring into the session keyring. */
3821         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3822
3823                 if (keyctl(KEYCTL_LINK,
3824                            KEY_SPEC_USER_KEYRING,
3825                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3826                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3827                         goto out;
3828                 }
3829         }
3830
3831         /* Restore uid/gid back */
3832         if (uid_is_valid(uid) && uid != saved_uid) {
3833                 if (setreuid(saved_uid, -1) < 0) {
3834                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3835                         goto out;
3836                 }
3837         }
3838
3839         if (gid_is_valid(gid) && gid != saved_gid) {
3840                 if (setregid(saved_gid, -1) < 0)
3841                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3842         }
3843
        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3845         if (!sd_id128_is_null(u->invocation_id)) {
3846                 key_serial_t key;
3847
3848                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3849                 if (key == -1)
3850                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3851                 else {
3852                         if (keyctl(KEYCTL_SETPERM, key,
3853                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3854                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3855                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3856                 }
3857         }
3858
3859 out:
3860         /* Revert back uid & gid for the last time, and exit */
3861         /* no extra logging, as only the first already reported error matters */
3862         if (getuid() != saved_uid)
3863                 (void) setreuid(saved_uid, -1);
3864
3865         if (getgid() != saved_gid)
3866                 (void) setregid(saved_gid, -1);
3867
3868         return r;
3869 }
3870
3871 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
3872         assert(array);
3873         assert(n);
3874         assert(pair);
3875
3876         if (pair[0] >= 0)
3877                 array[(*n)++] = pair[0];
3878         if (pair[1] >= 0)
3879                 array[(*n)++] = pair[1];
3880 }
3881
3882 static int close_remaining_fds(
3883                 const ExecParameters *params,
3884                 const ExecRuntime *runtime,
3885                 const DynamicCreds *dcreds,
3886                 int user_lookup_fd,
3887                 int socket_fd,
3888                 const int *fds, size_t n_fds) {
3889
3890         size_t n_dont_close = 0;
3891         int dont_close[n_fds + 12];
3892
3893         assert(params);
3894
3895         if (params->stdin_fd >= 0)
3896                 dont_close[n_dont_close++] = params->stdin_fd;
3897         if (params->stdout_fd >= 0)
3898                 dont_close[n_dont_close++] = params->stdout_fd;
3899         if (params->stderr_fd >= 0)
3900                 dont_close[n_dont_close++] = params->stderr_fd;
3901
3902         if (socket_fd >= 0)
3903                 dont_close[n_dont_close++] = socket_fd;
3904         if (n_fds > 0) {
3905                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3906                 n_dont_close += n_fds;
3907         }
3908
3909         if (runtime) {
3910                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3911                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3912         }
3913
3914         if (dcreds) {
3915                 if (dcreds->user)
3916                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3917                 if (dcreds->group)
3918                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3919         }
3920
3921         if (user_lookup_fd >= 0)
3922                 dont_close[n_dont_close++] = user_lookup_fd;
3923
3924         return close_all_fds(dont_close, n_dont_close);
3925 }
3926
3927 static int send_user_lookup(
3928                 Unit *unit,
3929                 int user_lookup_fd,
3930                 uid_t uid,
3931                 gid_t gid) {
3932
3933         assert(unit);
3934
3935         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3936          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3937          * specified. */
3938
3939         if (user_lookup_fd < 0)
3940                 return 0;
3941
3942         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3943                 return 0;
3944
3945         if (writev(user_lookup_fd,
3946                (struct iovec[]) {
3947                            IOVEC_INIT(&uid, sizeof(uid)),
3948                            IOVEC_INIT(&gid, sizeof(gid)),
3949                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3950                 return -errno;
3951
3952         return 0;
3953 }
3954
3955 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3956         int r;
3957
3958         assert(c);
3959         assert(home);
3960         assert(buf);
3961
3962         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3963
3964         if (*home)
3965                 return 0;
3966
3967         if (!c->working_directory_home)
3968                 return 0;
3969
3970         r = get_home_dir(buf);
3971         if (r < 0)
3972                 return r;
3973
3974         *home = *buf;
3975         return 1;
3976 }
3977
3978 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3979         _cleanup_strv_free_ char ** list = NULL;
3980         int r;
3981
3982         assert(c);
3983         assert(p);
3984         assert(ret);
3985
3986         assert(c->dynamic_user);
3987
3988         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3989          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3990          * directories. */
3991
3992         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3993                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3994                         continue;
3995
3996                 if (!p->prefix[t])
3997                         continue;
3998
3999                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4000                         char *e;
4001
4002                         if (exec_directory_is_private(c, t))
4003                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4004                         else
4005                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4006                         if (!e)
4007                                 return -ENOMEM;
4008
4009                         r = strv_consume(&list, e);
4010                         if (r < 0)
4011                                 return r;
4012                 }
4013         }
4014
4015         *ret = TAKE_PTR(list);
4016
4017         return 0;
4018 }
4019
4020 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4021         bool using_subcgroup;
4022         char *p;
4023
4024         assert(params);
4025         assert(ret);
4026
4027         if (!params->cgroup_path)
4028                 return -EINVAL;
4029
4030         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4031          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4032          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4033          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4034          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4035          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4036          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4037          * flag, which is only passed for the former statements, not for the latter. */
4038
4039         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4040         if (using_subcgroup)
4041                 p = path_join(params->cgroup_path, ".control");
4042         else
4043                 p = strdup(params->cgroup_path);
4044         if (!p)
4045                 return -ENOMEM;
4046
4047         *ret = p;
4048         return using_subcgroup;
4049 }
4050
4051 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4052         _cleanup_(cpu_set_reset) CPUSet s = {};
4053         int r;
4054
4055         assert(c);
4056         assert(ret);
4057
4058         if (!c->numa_policy.nodes.set) {
4059                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4060                 return 0;
4061         }
4062
4063         r = numa_to_cpu_set(&c->numa_policy, &s);
4064         if (r < 0)
4065                 return r;
4066
4067         cpu_set_reset(ret);
4068
4069         return cpu_set_add_all(ret, &s);
4070 }
4071
4072 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4073         assert(c);
4074
4075         return c->cpu_affinity_from_numa;
4076 }
4077
4078 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4079         int r;
4080
4081         assert(fds);
4082         assert(n_fds);
4083         assert(*n_fds < fds_size);
4084         assert(ret_fd);
4085
4086         if (fd < 0) {
4087                 *ret_fd = -EBADF;
4088                 return 0;
4089         }
4090
4091         if (fd < 3 + (int) *n_fds) {
4092                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4093                  * the fds we pass to the process (or which are closed only during execve). */
4094
4095                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4096                 if (r < 0)
4097                         return -errno;
4098
4099                 close_and_replace(fd, r);
4100         }
4101
4102         *ret_fd = fds[*n_fds] = fd;
4103         (*n_fds) ++;
4104         return 1;
4105 }
4106
4107 static int exec_child(
4108                 Unit *unit,
4109                 const ExecCommand *command,
4110                 const ExecContext *context,
4111                 const ExecParameters *params,
4112                 ExecRuntime *runtime,
4113                 DynamicCreds *dcreds,
4114                 int socket_fd,
4115                 const int named_iofds[static 3],
4116                 int *fds,
4117                 size_t n_socket_fds,
4118                 size_t n_storage_fds,
4119                 char **files_env,
4120                 int user_lookup_fd,
4121                 int *exit_status) {
4122
4123         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4124         int r, ngids = 0, exec_fd;
4125         _cleanup_free_ gid_t *supplementary_gids = NULL;
4126         const char *username = NULL, *groupname = NULL;
4127         _cleanup_free_ char *home_buffer = NULL;
4128         const char *home = NULL, *shell = NULL;
4129         char **final_argv = NULL;
4130         dev_t journal_stream_dev = 0;
4131         ino_t journal_stream_ino = 0;
4132         bool userns_set_up = false;
4133         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4134                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4135                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4136                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4137 #if HAVE_SELINUX
4138         _cleanup_free_ char *mac_selinux_context_net = NULL;
4139         bool use_selinux = false;
4140 #endif
4141 #if ENABLE_SMACK
4142         bool use_smack = false;
4143 #endif
4144 #if HAVE_APPARMOR
4145         bool use_apparmor = false;
4146 #endif
4147         uid_t saved_uid = getuid();
4148         gid_t saved_gid = getgid();
4149         uid_t uid = UID_INVALID;
4150         gid_t gid = GID_INVALID;
4151         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4152                n_keep_fds; /* total number of fds not to close */
4153         int secure_bits;
4154         _cleanup_free_ gid_t *gids_after_pam = NULL;
4155         int ngids_after_pam = 0;
4156
4157         assert(unit);
4158         assert(command);
4159         assert(context);
4160         assert(params);
4161         assert(exit_status);
4162
4163         /* Explicitly test for CVE-2021-4034 inspired invocations */
4164         assert(command->path);
4165         assert(!strv_isempty(command->argv));
4166
4167         rename_process_from_path(command->path);
4168
4169         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4170          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4171          * both of which will be demoted to SIG_DFL. */
4172         (void) default_signals(SIGNALS_CRASH_HANDLER,
4173                                SIGNALS_IGNORE);
4174
4175         if (context->ignore_sigpipe)
4176                 (void) ignore_signals(SIGPIPE);
4177
4178         r = reset_signal_mask();
4179         if (r < 0) {
4180                 *exit_status = EXIT_SIGNAL_MASK;
4181                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4182         }
4183
4184         if (params->idle_pipe)
4185                 do_idle_pipe_dance(params->idle_pipe);
4186
4187         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4188          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4189          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4190          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4191
4192         log_forget_fds();
4193         log_set_open_when_needed(true);
4194
4195         /* In case anything used libc syslog(), close this here, too */
4196         closelog();
4197
4198         int keep_fds[n_fds + 3];
4199         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4200         n_keep_fds = n_fds;
4201
4202         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4203         if (r < 0) {
4204                 *exit_status = EXIT_FDS;
4205                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4206         }
4207
4208 #if HAVE_LIBBPF
4209         if (unit->manager->restrict_fs) {
4210                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4211                 if (bpf_map_fd < 0) {
4212                         *exit_status = EXIT_FDS;
4213                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4214                 }
4215
4216                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4217                 if (r < 0) {
4218                         *exit_status = EXIT_FDS;
4219                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4220                 }
4221         }
4222 #endif
4223
4224         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4225         if (r < 0) {
4226                 *exit_status = EXIT_FDS;
4227                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4228         }
4229
4230         if (!context->same_pgrp &&
4231             setsid() < 0) {
4232                 *exit_status = EXIT_SETSID;
4233                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4234         }
4235
4236         exec_context_tty_reset(context, params);
4237
4238         if (unit_shall_confirm_spawn(unit)) {
4239                 _cleanup_free_ char *cmdline = NULL;
4240
4241                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4242                 if (!cmdline) {
4243                         *exit_status = EXIT_MEMORY;
4244                         return log_oom();
4245                 }
4246
4247                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4248                 if (r != CONFIRM_EXECUTE) {
4249                         if (r == CONFIRM_PRETEND_SUCCESS) {
4250                                 *exit_status = EXIT_SUCCESS;
4251                                 return 0;
4252                         }
4253                         *exit_status = EXIT_CONFIRM;
4254                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4255                                                     "Execution cancelled by the user");
4256                 }
4257         }
4258
4259         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4260          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4261          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4262          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4263          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4264         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4265             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4266                 *exit_status = EXIT_MEMORY;
4267                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4268         }
4269
4270         if (context->dynamic_user && dcreds) {
4271                 _cleanup_strv_free_ char **suggested_paths = NULL;
4272
4273                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4274                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4275                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4276                         *exit_status = EXIT_USER;
4277                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4278                 }
4279
4280                 r = compile_suggested_paths(context, params, &suggested_paths);
4281                 if (r < 0) {
4282                         *exit_status = EXIT_MEMORY;
4283                         return log_oom();
4284                 }
4285
4286                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4287                 if (r < 0) {
4288                         *exit_status = EXIT_USER;
4289                         if (r == -EILSEQ)
4290                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4291                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4292                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4293                 }
4294
4295                 if (!uid_is_valid(uid)) {
4296                         *exit_status = EXIT_USER;
4297                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4298                 }
4299
4300                 if (!gid_is_valid(gid)) {
4301                         *exit_status = EXIT_USER;
4302                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4303                 }
4304
4305                 if (dcreds->user)
4306                         username = dcreds->user->name;
4307
4308         } else {
4309                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4310                 if (r < 0) {
4311                         *exit_status = EXIT_USER;
4312                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4313                 }
4314
4315                 r = get_fixed_group(context, &groupname, &gid);
4316                 if (r < 0) {
4317                         *exit_status = EXIT_GROUP;
4318                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4319                 }
4320         }
4321
4322         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4323         r = get_supplementary_groups(context, username, groupname, gid,
4324                                      &supplementary_gids, &ngids);
4325         if (r < 0) {
4326                 *exit_status = EXIT_GROUP;
4327                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4328         }
4329
4330         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4331         if (r < 0) {
4332                 *exit_status = EXIT_USER;
4333                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4334         }
4335
4336         user_lookup_fd = safe_close(user_lookup_fd);
4337
4338         r = acquire_home(context, uid, &home, &home_buffer);
4339         if (r < 0) {
4340                 *exit_status = EXIT_CHDIR;
4341                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4342         }
4343
        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must be sure to drop O_NONBLOCK */
4346         if (socket_fd >= 0)
4347                 (void) fd_nonblock(socket_fd, false);
4348
4349         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4350          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4351         if (params->cgroup_path) {
4352                 _cleanup_free_ char *p = NULL;
4353
4354                 r = exec_parameters_get_cgroup_path(params, &p);
4355                 if (r < 0) {
4356                         *exit_status = EXIT_CGROUP;
4357                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4358                 }
4359
4360                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4361                 if (r == -EUCLEAN) {
4362                         *exit_status = EXIT_CGROUP;
4363                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4364                                                     "because the cgroup or one of its parents or "
4365                                                     "siblings is in the threaded mode: %m", p);
4366                 }
4367                 if (r < 0) {
4368                         *exit_status = EXIT_CGROUP;
4369                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4370                 }
4371         }
4372
4373         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4374                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4375                 if (r < 0) {
4376                         *exit_status = EXIT_NETWORK;
4377                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4378                 }
4379         }
4380
4381         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4382                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4383                 if (r < 0) {
4384                         *exit_status = EXIT_NAMESPACE;
4385                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4386                 }
4387         }
4388
4389         r = setup_input(context, params, socket_fd, named_iofds);
4390         if (r < 0) {
4391                 *exit_status = EXIT_STDIN;
4392                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4393         }
4394
4395         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4396         if (r < 0) {
4397                 *exit_status = EXIT_STDOUT;
4398                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4399         }
4400
4401         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4402         if (r < 0) {
4403                 *exit_status = EXIT_STDERR;
4404                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4405         }
4406
4407         if (context->oom_score_adjust_set) {
4408                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4409                  * prohibit write access to this file, and we shouldn't trip up over that. */
4410                 r = set_oom_score_adjust(context->oom_score_adjust);
4411                 if (ERRNO_IS_PRIVILEGE(r))
4412                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4413                 else if (r < 0) {
4414                         *exit_status = EXIT_OOM_ADJUST;
4415                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4416                 }
4417         }
4418
4419         if (context->coredump_filter_set) {
4420                 r = set_coredump_filter(context->coredump_filter);
4421                 if (ERRNO_IS_PRIVILEGE(r))
4422                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4423                 else if (r < 0)
4424                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4425         }
4426
4427         if (context->nice_set) {
4428                 r = setpriority_closest(context->nice);
4429                 if (r < 0)
4430                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4431         }
4432
4433         if (context->cpu_sched_set) {
4434                 struct sched_param param = {
4435                         .sched_priority = context->cpu_sched_priority,
4436                 };
4437
4438                 r = sched_setscheduler(0,
4439                                        context->cpu_sched_policy |
4440                                        (context->cpu_sched_reset_on_fork ?
4441                                         SCHED_RESET_ON_FORK : 0),
4442                                        &param);
4443                 if (r < 0) {
4444                         *exit_status = EXIT_SETSCHEDULER;
4445                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4446                 }
4447         }
4448
4449         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4450                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4451                 const CPUSet *cpu_set;
4452
4453                 if (context->cpu_affinity_from_numa) {
4454                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4455                         if (r < 0) {
4456                                 *exit_status = EXIT_CPUAFFINITY;
4457                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4458                         }
4459
4460                         cpu_set = &converted_cpu_set;
4461                 } else
4462                         cpu_set = &context->cpu_set;
4463
4464                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4465                         *exit_status = EXIT_CPUAFFINITY;
4466                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4467                 }
4468         }
4469
4470         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4471                 r = apply_numa_policy(&context->numa_policy);
4472                 if (r == -EOPNOTSUPP)
4473                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4474                 else if (r < 0) {
4475                         *exit_status = EXIT_NUMA_POLICY;
4476                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4477                 }
4478         }
4479
4480         if (context->ioprio_set)
4481                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4482                         *exit_status = EXIT_IOPRIO;
4483                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4484                 }
4485
4486         if (context->timer_slack_nsec != NSEC_INFINITY)
4487                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4488                         *exit_status = EXIT_TIMERSLACK;
4489                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4490                 }
4491
4492         if (context->personality != PERSONALITY_INVALID) {
4493                 r = safe_personality(context->personality);
4494                 if (r < 0) {
4495                         *exit_status = EXIT_PERSONALITY;
4496                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4497                 }
4498         }
4499
4500         if (context->utmp_id) {
4501                 const char *line = context->tty_path ?
4502                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4503                         NULL;
4504                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4505                                       line,
4506                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4507                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4508                                       USER_PROCESS,
4509                                       username);
4510         }
4511
4512         if (uid_is_valid(uid)) {
4513                 r = chown_terminal(STDIN_FILENO, uid);
4514                 if (r < 0) {
4515                         *exit_status = EXIT_STDIN;
4516                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4517                 }
4518         }
4519
4520         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4521          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4522          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4523          * touch a single hierarchy too. */
4524         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4525                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4526                 if (r < 0) {
4527                         *exit_status = EXIT_CGROUP;
4528                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4529                 }
4530         }
4531
4532         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4533
4534         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4535                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4536                 if (r < 0)
4537                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4538         }
4539
4540         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4541                 r = setup_credentials(context, params, unit->id, uid);
4542                 if (r < 0) {
4543                         *exit_status = EXIT_CREDENTIALS;
4544                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4545                 }
4546         }
4547
4548         r = build_environment(
4549                         unit,
4550                         context,
4551                         params,
4552                         n_fds,
4553                         home,
4554                         username,
4555                         shell,
4556                         journal_stream_dev,
4557                         journal_stream_ino,
4558                         &our_env);
4559         if (r < 0) {
4560                 *exit_status = EXIT_MEMORY;
4561                 return log_oom();
4562         }
4563
4564         r = build_pass_environment(context, &pass_env);
4565         if (r < 0) {
4566                 *exit_status = EXIT_MEMORY;
4567                 return log_oom();
4568         }
4569
4570         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4571          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4572          * not specify PATH but the unit has ExecSearchPath. */
4573         if (!strv_isempty(context->exec_search_path)) {
4574                 _cleanup_free_ char *joined = NULL;
4575
4576                 joined = strv_join(context->exec_search_path, ":");
4577                 if (!joined) {
4578                         *exit_status = EXIT_MEMORY;
4579                         return log_oom();
4580                 }
4581
4582                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4583                 if (r < 0) {
4584                         *exit_status = EXIT_MEMORY;
4585                         return log_oom();
4586                 }
4587         }
4588
4589         accum_env = strv_env_merge(params->environment,
4590                                    our_env,
4591                                    joined_exec_search_path,
4592                                    pass_env,
4593                                    context->environment,
4594                                    files_env);
4595         if (!accum_env) {
4596                 *exit_status = EXIT_MEMORY;
4597                 return log_oom();
4598         }
4599         accum_env = strv_env_clean(accum_env);
4600
4601         (void) umask(context->umask);
4602
4603         r = setup_keyring(unit, context, params, uid, gid);
4604         if (r < 0) {
4605                 *exit_status = EXIT_KEYRING;
4606                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4607         }
4608
4609         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4610          * from it. */
4611         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4612
4613         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4614          * for it, and the kernel doesn't actually support ambient caps. */
4615         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4616
4617         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4618          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4619          * desired. */
4620         if (needs_ambient_hack)
4621                 needs_setuid = false;
4622         else
4623                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4624
4625         if (needs_sandboxing) {
4626                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4627                  * /sys being present. The actual MAC context application will happen later, as late as
4628                  * possible, to avoid impacting our own code paths. */
4629
4630 #if HAVE_SELINUX
4631                 use_selinux = mac_selinux_use();
4632 #endif
4633 #if ENABLE_SMACK
4634                 use_smack = mac_smack_use();
4635 #endif
4636 #if HAVE_APPARMOR
4637                 use_apparmor = mac_apparmor_use();
4638 #endif
4639         }
4640
4641         if (needs_sandboxing) {
4642                 int which_failed;
4643
4644                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4645                  * is set here. (See below.) */
4646
4647                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4648                 if (r < 0) {
4649                         *exit_status = EXIT_LIMITS;
4650                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4651                 }
4652         }
4653
4654         if (needs_setuid && context->pam_name && username) {
4655                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4656                  * wins here. (See above.) */
4657
4658                 /* All fds passed in the fds array will be closed in the pam child process. */
4659                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4660                 if (r < 0) {
4661                         *exit_status = EXIT_PAM;
4662                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4663                 }
4664
4665                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4666                 if (ngids_after_pam < 0) {
4667                         *exit_status = EXIT_MEMORY;
4668                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4669                 }
4670         }
4671
4672         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4673                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4674                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4675                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4676
4677                 userns_set_up = true;
4678                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4679                 if (r < 0) {
4680                         *exit_status = EXIT_USER;
4681                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4682                 }
4683         }
4684
4685         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4686
4687                 if (ns_type_supported(NAMESPACE_NET)) {
4688                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4689                         if (r == -EPERM)
4690                                 log_unit_warning_errno(unit, r,
4691                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4692                         else if (r < 0) {
4693                                 *exit_status = EXIT_NETWORK;
4694                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4695                         }
4696                 } else if (context->network_namespace_path) {
4697                         *exit_status = EXIT_NETWORK;
4698                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4699                                                     "NetworkNamespacePath= is not supported, refusing.");
4700                 } else
4701                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4702         }
4703
4704         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4705
4706                 if (ns_type_supported(NAMESPACE_IPC)) {
4707                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4708                         if (r == -EPERM)
4709                                 log_unit_warning_errno(unit, r,
4710                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4711                         else if (r < 0) {
4712                                 *exit_status = EXIT_NAMESPACE;
4713                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4714                         }
4715                 } else if (context->ipc_namespace_path) {
4716                         *exit_status = EXIT_NAMESPACE;
4717                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4718                                                     "IPCNamespacePath= is not supported, refusing.");
4719                 } else
4720                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4721         }
4722
4723         if (needs_mount_namespace) {
4724                 _cleanup_free_ char *error_path = NULL;
4725
4726                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4727                 if (r < 0) {
4728                         *exit_status = EXIT_NAMESPACE;
4729                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4730                                                     error_path ? ": " : "", strempty(error_path));
4731                 }
4732         }
4733
4734         if (needs_sandboxing) {
4735                 r = apply_protect_hostname(unit, context, exit_status);
4736                 if (r < 0)
4737                         return r;
4738         }
4739
4740         /* Drop groups as early as possible.
4741          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4742          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4743         if (needs_setuid) {
4744                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4745                 int ngids_to_enforce = 0;
4746
4747                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4748                                                    ngids,
4749                                                    gids_after_pam,
4750                                                    ngids_after_pam,
4751                                                    &gids_to_enforce);
4752                 if (ngids_to_enforce < 0) {
4753                         *exit_status = EXIT_MEMORY;
4754                         return log_unit_error_errno(unit,
4755                                                     ngids_to_enforce,
4756                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4757                 }
4758
4759                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4760                 if (r < 0) {
4761                         *exit_status = EXIT_GROUP;
4762                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4763                 }
4764         }
4765
4766         /* If the user namespace was not set up above, try to do it now.
4767          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4768          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4769          * case of mount namespaces being less privileged when the mount point list is copied from a
4770          * different user namespace). */
4771
4772         if (needs_sandboxing && context->private_users && !userns_set_up) {
4773                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4774                 if (r < 0) {
4775                         *exit_status = EXIT_USER;
4776                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4777                 }
4778         }
4779
4780         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4781          * shall execute. */
4782
4783         _cleanup_free_ char *executable = NULL;
4784         _cleanup_close_ int executable_fd = -EBADF;
4785         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4786         if (r < 0) {
4787                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4788                         log_unit_struct_errno(unit, LOG_INFO, r,
4789                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4790                                               LOG_UNIT_INVOCATION_ID(unit),
4791                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4792                                                                command->path),
4793                                               "EXECUTABLE=%s", command->path);
4794                         return 0;
4795                 }
4796
4797                 *exit_status = EXIT_EXEC;
4798
4799                 return log_unit_struct_errno(unit, LOG_INFO, r,
4800                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4801                                              LOG_UNIT_INVOCATION_ID(unit),
4802                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4803                                                               command->path),
4804                                              "EXECUTABLE=%s", command->path);
4805         }
4806
4807         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4808         if (r < 0) {
4809                 *exit_status = EXIT_FDS;
4810                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4811         }
4812
4813 #if HAVE_SELINUX
4814         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4815                 int fd = -EBADF;
4816
4817                 if (socket_fd >= 0)
4818                         fd = socket_fd;
4819                 else if (params->n_socket_fds == 1)
4820                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4821                          * use context from that fd to compute the label. */
4822                         fd = params->fds[0];
4823
4824                 if (fd >= 0) {
4825                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4826                         if (r < 0) {
4827                                 if (!context->selinux_context_ignore) {
4828                                         *exit_status = EXIT_SELINUX_CONTEXT;
4829                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4830                                 }
4831                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4832                         }
4833                 }
4834         }
4835 #endif
4836
4837         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4838          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4839          * however if we have it as we want to keep it open until the final execve(). */
4840
4841         r = close_all_fds(keep_fds, n_keep_fds);
4842         if (r >= 0)
4843                 r = shift_fds(fds, n_fds);
4844         if (r >= 0)
4845                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4846         if (r < 0) {
4847                 *exit_status = EXIT_FDS;
4848                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4849         }
4850
4851         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4852          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4853          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4854          * came this far. */
4855
4856         secure_bits = context->secure_bits;
4857
4858         if (needs_sandboxing) {
4859                 uint64_t bset;
4860
4861                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4862                  * requested. (Note this is placed after the general resource limit initialization, see
4863                  * above, in order to take precedence.) */
4864                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4865                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4866                                 *exit_status = EXIT_LIMITS;
4867                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4868                         }
4869                 }
4870
4871 #if ENABLE_SMACK
4872                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4873                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4874                 if (use_smack) {
4875                         r = setup_smack(unit->manager, context, executable_fd);
4876                         if (r < 0 && !context->smack_process_label_ignore) {
4877                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4878                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4879                         }
4880                 }
4881 #endif
4882
4883                 bset = context->capability_bounding_set;
4884                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4885                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4886                  * instead of us doing that */
4887                 if (needs_ambient_hack)
4888                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4889                                 (UINT64_C(1) << CAP_SETUID) |
4890                                 (UINT64_C(1) << CAP_SETGID);
4891
4892                 if (!cap_test_all(bset)) {
4893                         r = capability_bounding_set_drop(bset, false);
4894                         if (r < 0) {
4895                                 *exit_status = EXIT_CAPABILITIES;
4896                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4897                         }
4898                 }
4899
4900                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4901                  * keep-caps set.
4902                  * To be able to raise the ambient capabilities after setresuid() they have to be
4903                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4904                  * After setresuid() the ambient capabilities can be raised as they are present in
4905                  * the permitted and inheritable set. However it is possible that someone wants to
4906                  * set ambient capabilities without changing the user, so we also set the ambient
4907                  * capabilities here.
4908                  * The requested ambient capabilities are raised in the inheritable set if the
4909                  * second argument is true. */
4910                 if (!needs_ambient_hack) {
4911                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4912                         if (r < 0) {
4913                                 *exit_status = EXIT_CAPABILITIES;
4914                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4915                         }
4916                 }
4917         }
4918
4919         /* chroot to root directory first, before we lose the ability to chroot */
4920         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4921         if (r < 0)
4922                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4923
4924         if (needs_setuid) {
4925                 if (uid_is_valid(uid)) {
4926                         r = enforce_user(context, uid);
4927                         if (r < 0) {
4928                                 *exit_status = EXIT_USER;
4929                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4930                         }
4931
4932                         if (!needs_ambient_hack &&
4933                             context->capability_ambient_set != 0) {
4934
4935                                 /* Raise the ambient capabilities after user change. */
4936                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4937                                 if (r < 0) {
4938                                         *exit_status = EXIT_CAPABILITIES;
4939                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4940                                 }
4941                         }
4942                 }
4943         }
4944
4945         /* Apply working directory here, because the working directory might be on NFS and only the user running
4946          * this service might have the correct privilege to change to the working directory */
4947         r = apply_working_directory(context, params, home, exit_status);
4948         if (r < 0)
4949                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4950
4951         if (needs_sandboxing) {
4952                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4953                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4954                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4955                  * are restricted. */
4956
4957 #if HAVE_SELINUX
4958                 if (use_selinux) {
4959                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4960
4961                         if (exec_context) {
4962                                 r = setexeccon(exec_context);
4963                                 if (r < 0) {
4964                                         if (!context->selinux_context_ignore) {
4965                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4966                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4967                                         }
4968                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4969                                 }
4970                         }
4971                 }
4972 #endif
4973
4974 #if HAVE_APPARMOR
4975                 if (use_apparmor && context->apparmor_profile) {
4976                         r = aa_change_onexec(context->apparmor_profile);
4977                         if (r < 0 && !context->apparmor_profile_ignore) {
4978                                 *exit_status = EXIT_APPARMOR_PROFILE;
4979                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4980                         }
4981                 }
4982 #endif
4983
4984                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4985                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4986                  * CAP_SETPCAP. */
4987                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4988                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4989                          * effective set here.
4990                          * The effective set is overwritten during execve with the following values:
4991                          * - ambient set (for non-root processes)
4992                          * - (inheritable | bounding) set (for root processes)
4993                          *
4994                          * Hence there is no security impact to raise it in the effective set before execve
4995                          */
4996                         r = capability_gain_cap_setpcap(NULL);
4997                         if (r < 0) {
4998                                 *exit_status = EXIT_CAPABILITIES;
4999                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5000                         }
5001                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5002                                 *exit_status = EXIT_SECUREBITS;
5003                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5004                         }
5005                 }
5006
5007                 if (context_has_no_new_privileges(context))
5008                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5009                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5010                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5011                         }
5012
5013 #if HAVE_SECCOMP
5014                 r = apply_address_families(unit, context);
5015                 if (r < 0) {
5016                         *exit_status = EXIT_ADDRESS_FAMILIES;
5017                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5018                 }
5019
5020                 r = apply_memory_deny_write_execute(unit, context);
5021                 if (r < 0) {
5022                         *exit_status = EXIT_SECCOMP;
5023                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5024                 }
5025
5026                 r = apply_restrict_realtime(unit, context);
5027                 if (r < 0) {
5028                         *exit_status = EXIT_SECCOMP;
5029                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5030                 }
5031
5032                 r = apply_restrict_suid_sgid(unit, context);
5033                 if (r < 0) {
5034                         *exit_status = EXIT_SECCOMP;
5035                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5036                 }
5037
5038                 r = apply_restrict_namespaces(unit, context);
5039                 if (r < 0) {
5040                         *exit_status = EXIT_SECCOMP;
5041                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5042                 }
5043
5044                 r = apply_protect_sysctl(unit, context);
5045                 if (r < 0) {
5046                         *exit_status = EXIT_SECCOMP;
5047                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5048                 }
5049
5050                 r = apply_protect_kernel_modules(unit, context);
5051                 if (r < 0) {
5052                         *exit_status = EXIT_SECCOMP;
5053                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5054                 }
5055
5056                 r = apply_protect_kernel_logs(unit, context);
5057                 if (r < 0) {
5058                         *exit_status = EXIT_SECCOMP;
5059                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5060                 }
5061
5062                 r = apply_protect_clock(unit, context);
5063                 if (r < 0) {
5064                         *exit_status = EXIT_SECCOMP;
5065                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5066                 }
5067
5068                 r = apply_private_devices(unit, context);
5069                 if (r < 0) {
5070                         *exit_status = EXIT_SECCOMP;
5071                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5072                 }
5073
5074                 r = apply_syscall_archs(unit, context);
5075                 if (r < 0) {
5076                         *exit_status = EXIT_SECCOMP;
5077                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5078                 }
5079
5080                 r = apply_lock_personality(unit, context);
5081                 if (r < 0) {
5082                         *exit_status = EXIT_SECCOMP;
5083                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5084                 }
5085
5086                 r = apply_syscall_log(unit, context);
5087                 if (r < 0) {
5088                         *exit_status = EXIT_SECCOMP;
5089                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5090                 }
5091
5092                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5093                  * by the filter as little as possible. */
5094                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5095                 if (r < 0) {
5096                         *exit_status = EXIT_SECCOMP;
5097                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5098                 }
5099 #endif
5100
5101 #if HAVE_LIBBPF
5102                 r = apply_restrict_filesystems(unit, context);
5103                 if (r < 0) {
5104                         *exit_status = EXIT_BPF;
5105                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5106                 }
5107 #endif
5108
5109         }
5110
5111         if (!strv_isempty(context->unset_environment)) {
5112                 char **ee = NULL;
5113
5114                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5115                 if (!ee) {
5116                         *exit_status = EXIT_MEMORY;
5117                         return log_oom();
5118                 }
5119
5120                 strv_free_and_replace(accum_env, ee);
5121         }
5122
5123         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5124                 replaced_argv = replace_env_argv(command->argv, accum_env);
5125                 if (!replaced_argv) {
5126                         *exit_status = EXIT_MEMORY;
5127                         return log_oom();
5128                 }
5129                 final_argv = replaced_argv;
5130         } else
5131                 final_argv = command->argv;
5132
5133         if (DEBUG_LOGGING) {
5134                 _cleanup_free_ char *line = NULL;
5135
5136                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5137                 if (!line) {
5138                         *exit_status = EXIT_MEMORY;
5139                         return log_oom();
5140                 }
5141
5142                 log_unit_struct(unit, LOG_DEBUG,
5143                                 "EXECUTABLE=%s", executable,
5144                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5145         }
5146
5147         if (exec_fd >= 0) {
5148                 uint8_t hot = 1;
5149
5150                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5151                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5152
5153                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5154                         *exit_status = EXIT_EXEC;
5155                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5156                 }
5157         }
5158
5159         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5160
5161         if (exec_fd >= 0) {
5162                 uint8_t hot = 0;
5163
5164                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5165                  * that POLLHUP on it no longer means execve() succeeded. */
5166
5167                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5168                         *exit_status = EXIT_EXEC;
5169                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5170                 }
5171         }
5172
5173         *exit_status = EXIT_EXEC;
5174         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5175 }
5176
5177 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5178 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5179
5180 int exec_spawn(Unit *unit,
5181                ExecCommand *command,
5182                const ExecContext *context,
5183                const ExecParameters *params,
5184                ExecRuntime *runtime,
5185                DynamicCreds *dcreds,
5186                pid_t *ret) {
5187
5188         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5189         _cleanup_free_ char *subcgroup_path = NULL;
5190         _cleanup_strv_free_ char **files_env = NULL;
5191         size_t n_storage_fds = 0, n_socket_fds = 0;
5192         _cleanup_free_ char *line = NULL;
5193         pid_t pid;
5194
5195         assert(unit);
5196         assert(command);
5197         assert(context);
5198         assert(ret);
5199         assert(params);
5200         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5201
5202         if (context->std_input == EXEC_INPUT_SOCKET ||
5203             context->std_output == EXEC_OUTPUT_SOCKET ||
5204             context->std_error == EXEC_OUTPUT_SOCKET) {
5205
5206                 if (params->n_socket_fds > 1)
5207                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5208
5209                 if (params->n_socket_fds == 0)
5210                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5211
5212                 socket_fd = params->fds[0];
5213         } else {
5214                 socket_fd = -EBADF;
5215                 fds = params->fds;
5216                 n_socket_fds = params->n_socket_fds;
5217                 n_storage_fds = params->n_storage_fds;
5218         }
5219
5220         r = exec_context_named_iofds(context, params, named_iofds);
5221         if (r < 0)
5222                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5223
5224         r = exec_context_load_environment(unit, context, &files_env);
5225         if (r < 0)
5226                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5227
5228         line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5229         if (!line)
5230                 return log_oom();
5231
5232         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5233            and, until the next SELinux policy changes, we save further reloads in future children. */
5234         mac_selinux_maybe_reload();
5235
5236         log_unit_struct(unit, LOG_DEBUG,
5237                         LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5238                         "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5239                                                            the mount namespace in the child, but we want to log
5240                                                            from the parent, so we need to use the (possibly
5241                                                            inaccurate) path here. */
5242                         LOG_UNIT_INVOCATION_ID(unit));
5243
5244         if (params->cgroup_path) {
5245                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5246                 if (r < 0)
5247                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5248                 if (r > 0) { /* We are using a child cgroup */
5249                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5250                         if (r < 0)
5251                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5252
5253                         /* Normally we would not propagate the xattrs to children but since we created this
5254                          * sub-cgroup internally we should do it. */
5255                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
5256                         cgroup_log_xattr_apply(unit, subcgroup_path);
5257                 }
5258         }
5259
5260         pid = fork();
5261         if (pid < 0)
5262                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5263
5264         if (pid == 0) {
5265                 int exit_status = EXIT_SUCCESS;
5266
5267                 r = exec_child(unit,
5268                                command,
5269                                context,
5270                                params,
5271                                runtime,
5272                                dcreds,
5273                                socket_fd,
5274                                named_iofds,
5275                                fds,
5276                                n_socket_fds,
5277                                n_storage_fds,
5278                                files_env,
5279                                unit->manager->user_lookup_fds[1],
5280                                &exit_status);
5281
5282                 if (r < 0) {
5283                         const char *status =
5284                                 exit_status_to_string(exit_status,
5285                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5286
5287                         log_unit_struct_errno(unit, LOG_ERR, r,
5288                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5289                                               LOG_UNIT_INVOCATION_ID(unit),
5290                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5291                                                                status, command->path),
5292                                               "EXECUTABLE=%s", command->path);
5293                 }
5294
5295                 _exit(exit_status);
5296         }
5297
5298         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5299
5300         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5301          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5302          * process will be killed too). */
5303         if (subcgroup_path)
5304                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5305
5306         exec_status_start(&command->exec_status, pid);
5307
5308         *ret = pid;
5309         return 0;
5310 }
5311
5312 void exec_context_init(ExecContext *c) {
5313         assert(c);
5314
5315         c->umask = 0022;
5316         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5317         c->cpu_sched_policy = SCHED_OTHER;
5318         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5319         c->syslog_level_prefix = true;
5320         c->ignore_sigpipe = true;
5321         c->timer_slack_nsec = NSEC_INFINITY;
5322         c->personality = PERSONALITY_INVALID;
5323         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5324                 c->directories[t].mode = 0755;
5325         c->timeout_clean_usec = USEC_INFINITY;
5326         c->capability_bounding_set = CAP_ALL;
5327         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5328         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5329         c->log_level_max = -1;
5330 #if HAVE_SECCOMP
5331         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5332 #endif
5333         c->tty_rows = UINT_MAX;
5334         c->tty_cols = UINT_MAX;
5335         numa_policy_reset(&c->numa_policy);
5336 }
5337
5338 void exec_context_done(ExecContext *c) {
5339         assert(c);
5340
5341         c->environment = strv_free(c->environment);
5342         c->environment_files = strv_free(c->environment_files);
5343         c->pass_environment = strv_free(c->pass_environment);
5344         c->unset_environment = strv_free(c->unset_environment);
5345
5346         rlimit_free_all(c->rlimit);
5347
5348         for (size_t l = 0; l < 3; l++) {
5349                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5350                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5351         }
5352
5353         c->working_directory = mfree(c->working_directory);
5354         c->root_directory = mfree(c->root_directory);
5355         c->root_image = mfree(c->root_image);
5356         c->root_image_options = mount_options_free_all(c->root_image_options);
5357         c->root_hash = mfree(c->root_hash);
5358         c->root_hash_size = 0;
5359         c->root_hash_path = mfree(c->root_hash_path);
5360         c->root_hash_sig = mfree(c->root_hash_sig);
5361         c->root_hash_sig_size = 0;
5362         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5363         c->root_verity = mfree(c->root_verity);
5364         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5365         c->extension_directories = strv_free(c->extension_directories);
5366         c->tty_path = mfree(c->tty_path);
5367         c->syslog_identifier = mfree(c->syslog_identifier);
5368         c->user = mfree(c->user);
5369         c->group = mfree(c->group);
5370
5371         c->supplementary_groups = strv_free(c->supplementary_groups);
5372
5373         c->pam_name = mfree(c->pam_name);
5374
5375         c->read_only_paths = strv_free(c->read_only_paths);
5376         c->read_write_paths = strv_free(c->read_write_paths);
5377         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5378         c->exec_paths = strv_free(c->exec_paths);
5379         c->no_exec_paths = strv_free(c->no_exec_paths);
5380         c->exec_search_path = strv_free(c->exec_search_path);
5381
5382         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5383         c->bind_mounts = NULL;
5384         c->n_bind_mounts = 0;
5385         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5386         c->temporary_filesystems = NULL;
5387         c->n_temporary_filesystems = 0;
5388         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5389
5390         cpu_set_reset(&c->cpu_set);
5391         numa_policy_reset(&c->numa_policy);
5392
5393         c->utmp_id = mfree(c->utmp_id);
5394         c->selinux_context = mfree(c->selinux_context);
5395         c->apparmor_profile = mfree(c->apparmor_profile);
5396         c->smack_process_label = mfree(c->smack_process_label);
5397
5398         c->restrict_filesystems = set_free(c->restrict_filesystems);
5399
5400         c->syscall_filter = hashmap_free(c->syscall_filter);
5401         c->syscall_archs = set_free(c->syscall_archs);
5402         c->address_families = set_free(c->address_families);
5403
5404         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5405                 exec_directory_done(&c->directories[t]);
5406
5407         c->log_level_max = -1;
5408
5409         exec_context_free_log_extra_fields(c);
5410         c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5411         c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
5412
5413         c->log_ratelimit_interval_usec = 0;
5414         c->log_ratelimit_burst = 0;
5415
5416         c->stdin_data = mfree(c->stdin_data);
5417         c->stdin_data_size = 0;
5418
5419         c->network_namespace_path = mfree(c->network_namespace_path);
5420         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5421
5422         c->log_namespace = mfree(c->log_namespace);
5423
5424         c->load_credentials = hashmap_free(c->load_credentials);
5425         c->set_credentials = hashmap_free(c->set_credentials);
5426 }
5427
5428 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5429         assert(c);
5430
5431         if (!runtime_prefix)
5432                 return 0;
5433
5434         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5435                 _cleanup_free_ char *p = NULL;
5436
5437                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5438                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5439                 else
5440                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5441                 if (!p)
5442                         return -ENOMEM;
5443
5444                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5445                  * service next. */
5446                 (void) rm_rf(p, REMOVE_ROOT);
5447
5448                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5449                         _cleanup_free_ char *symlink_abs = NULL;
5450
5451                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5452                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5453                         else
5454                                 symlink_abs = path_join(runtime_prefix, *symlink);
5455                         if (!symlink_abs)
5456                                 return -ENOMEM;
5457
5458                         (void) unlink(symlink_abs);
5459                 }
5460
5461         }
5462
5463         return 0;
5464 }
5465
5466 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5467         _cleanup_free_ char *p = NULL;
5468
5469         assert(c);
5470
5471         if (!runtime_prefix || !unit)
5472                 return 0;
5473
5474         p = path_join(runtime_prefix, "credentials", unit);
5475         if (!p)
5476                 return -ENOMEM;
5477
5478         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5479          * unmount it, and afterwards remove the mount point */
5480         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5481         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5482
5483         return 0;
5484 }
5485
5486 static void exec_command_done(ExecCommand *c) {
5487         assert(c);
5488
5489         c->path = mfree(c->path);
5490         c->argv = strv_free(c->argv);
5491 }
5492
5493 void exec_command_done_array(ExecCommand *c, size_t n) {
5494         for (size_t i = 0; i < n; i++)
5495                 exec_command_done(c+i);
5496 }
5497
5498 ExecCommand* exec_command_free_list(ExecCommand *c) {
5499         ExecCommand *i;
5500
5501         while ((i = c)) {
5502                 LIST_REMOVE(command, c, i);
5503                 exec_command_done(i);
5504                 free(i);
5505         }
5506
5507         return NULL;
5508 }
5509
5510 void exec_command_free_array(ExecCommand **c, size_t n) {
5511         for (size_t i = 0; i < n; i++)
5512                 c[i] = exec_command_free_list(c[i]);
5513 }
5514
5515 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5516         for (size_t i = 0; i < n; i++)
5517                 exec_status_reset(&c[i].exec_status);
5518 }
5519
5520 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5521         for (size_t i = 0; i < n; i++)
5522                 LIST_FOREACH(command, z, c[i])
5523                         exec_status_reset(&z->exec_status);
5524 }
5525
5526 typedef struct InvalidEnvInfo {
5527         const Unit *unit;
5528         const char *path;
5529 } InvalidEnvInfo;
5530
5531 static void invalid_env(const char *p, void *userdata) {
5532         InvalidEnvInfo *info = userdata;
5533
5534         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5535 }
5536
5537 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5538         assert(c);
5539
5540         switch (fd_index) {
5541
5542         case STDIN_FILENO:
5543                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5544                         return NULL;
5545
5546                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5547
5548         case STDOUT_FILENO:
5549                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5550                         return NULL;
5551
5552                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5553
5554         case STDERR_FILENO:
5555                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5556                         return NULL;
5557
5558                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5559
5560         default:
5561                 return NULL;
5562         }
5563 }
5564
5565 static int exec_context_named_iofds(
5566                 const ExecContext *c,
5567                 const ExecParameters *p,
5568                 int named_iofds[static 3]) {
5569
5570         size_t targets;
5571         const char* stdio_fdname[3];
5572         size_t n_fds;
5573
5574         assert(c);
5575         assert(p);
5576         assert(named_iofds);
5577
5578         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5579                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5580                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5581
5582         for (size_t i = 0; i < 3; i++)
5583                 stdio_fdname[i] = exec_context_fdname(c, i);
5584
5585         n_fds = p->n_storage_fds + p->n_socket_fds;
5586
5587         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5588                 if (named_iofds[STDIN_FILENO] < 0 &&
5589                     c->std_input == EXEC_INPUT_NAMED_FD &&
5590                     stdio_fdname[STDIN_FILENO] &&
5591                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5592
5593                         named_iofds[STDIN_FILENO] = p->fds[i];
5594                         targets--;
5595
5596                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5597                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5598                            stdio_fdname[STDOUT_FILENO] &&
5599                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5600
5601                         named_iofds[STDOUT_FILENO] = p->fds[i];
5602                         targets--;
5603
5604                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5605                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5606                            stdio_fdname[STDERR_FILENO] &&
5607                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5608
5609                         named_iofds[STDERR_FILENO] = p->fds[i];
5610                         targets--;
5611                 }
5612
5613         return targets == 0 ? 0 : -ENOENT;
5614 }
5615
5616 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5617         _cleanup_strv_free_ char **v = NULL;
5618         int r;
5619
5620         assert(c);
5621         assert(ret);
5622
5623         STRV_FOREACH(i, c->environment_files) {
5624                 _cleanup_globfree_ glob_t pglob = {};
5625                 bool ignore = false;
5626                 char *fn = *i;
5627
5628                 if (fn[0] == '-') {
5629                         ignore = true;
5630                         fn++;
5631                 }
5632
5633                 if (!path_is_absolute(fn)) {
5634                         if (ignore)
5635                                 continue;
5636                         return -EINVAL;
5637                 }
5638
5639                 /* Filename supports globbing, take all matching files */
5640                 r = safe_glob(fn, 0, &pglob);
5641                 if (r < 0) {
5642                         if (ignore)
5643                                 continue;
5644                         return r;
5645                 }
5646
5647                 /* When we don't match anything, -ENOENT should be returned */
5648                 assert(pglob.gl_pathc > 0);
5649
5650                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5651                         _cleanup_strv_free_ char **p = NULL;
5652
5653                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5654                         if (r < 0) {
5655                                 if (ignore)
5656                                         continue;
5657                                 return r;
5658                         }
5659
5660                         /* Log invalid environment variables with filename */
5661                         if (p) {
5662                                 InvalidEnvInfo info = {
5663                                         .unit = unit,
5664                                         .path = pglob.gl_pathv[n]
5665                                 };
5666
5667                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5668                         }
5669
5670                         if (!v)
5671                                 v = TAKE_PTR(p);
5672                         else {
5673                                 char **m = strv_env_merge(v, p);
5674                                 if (!m)
5675                                         return -ENOMEM;
5676
5677                                 strv_free_and_replace(v, m);
5678                         }
5679                 }
5680         }
5681
5682         *ret = TAKE_PTR(v);
5683
5684         return 0;
5685 }
5686
5687 static bool tty_may_match_dev_console(const char *tty) {
5688         _cleanup_free_ char *resolved = NULL;
5689
5690         if (!tty)
5691                 return true;
5692
5693         tty = skip_dev_prefix(tty);
5694
5695         /* trivial identity? */
5696         if (streq(tty, "console"))
5697                 return true;
5698
5699         if (resolve_dev_console(&resolved) < 0)
5700                 return true; /* if we could not resolve, assume it may */
5701
5702         /* "tty0" means the active VC, so it may be the same sometimes */
5703         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5704 }
5705
5706 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5707         assert(ec);
5708
5709         return ec->tty_reset ||
5710                 ec->tty_vhangup ||
5711                 ec->tty_vt_disallocate ||
5712                 is_terminal_input(ec->std_input) ||
5713                 is_terminal_output(ec->std_output) ||
5714                 is_terminal_output(ec->std_error);
5715 }
5716
5717 bool exec_context_may_touch_console(const ExecContext *ec) {
5718
5719         return exec_context_may_touch_tty(ec) &&
5720                tty_may_match_dev_console(exec_context_tty_path(ec));
5721 }
5722
5723 static void strv_fprintf(FILE *f, char **l) {
5724         assert(f);
5725
5726         STRV_FOREACH(g, l)
5727                 fprintf(f, " %s", *g);
5728 }
5729
/* Writes one line of the form "<prefix><name>: <s1> <s2> ..." to f. Nothing is written if the vector is
 * empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5741
5742 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5743         int r;
5744
5745         assert(c);
5746         assert(f);
5747
5748         prefix = strempty(prefix);
5749
5750         fprintf(f,
5751                 "%sUMask: %04o\n"
5752                 "%sWorkingDirectory: %s\n"
5753                 "%sRootDirectory: %s\n"
5754                 "%sNonBlocking: %s\n"
5755                 "%sPrivateTmp: %s\n"
5756                 "%sPrivateDevices: %s\n"
5757                 "%sProtectKernelTunables: %s\n"
5758                 "%sProtectKernelModules: %s\n"
5759                 "%sProtectKernelLogs: %s\n"
5760                 "%sProtectClock: %s\n"
5761                 "%sProtectControlGroups: %s\n"
5762                 "%sPrivateNetwork: %s\n"
5763                 "%sPrivateUsers: %s\n"
5764                 "%sProtectHome: %s\n"
5765                 "%sProtectSystem: %s\n"
5766                 "%sMountAPIVFS: %s\n"
5767                 "%sIgnoreSIGPIPE: %s\n"
5768                 "%sMemoryDenyWriteExecute: %s\n"
5769                 "%sRestrictRealtime: %s\n"
5770                 "%sRestrictSUIDSGID: %s\n"
5771                 "%sKeyringMode: %s\n"
5772                 "%sProtectHostname: %s\n"
5773                 "%sProtectProc: %s\n"
5774                 "%sProcSubset: %s\n",
5775                 prefix, c->umask,
5776                 prefix, empty_to_root(c->working_directory),
5777                 prefix, empty_to_root(c->root_directory),
5778                 prefix, yes_no(c->non_blocking),
5779                 prefix, yes_no(c->private_tmp),
5780                 prefix, yes_no(c->private_devices),
5781                 prefix, yes_no(c->protect_kernel_tunables),
5782                 prefix, yes_no(c->protect_kernel_modules),
5783                 prefix, yes_no(c->protect_kernel_logs),
5784                 prefix, yes_no(c->protect_clock),
5785                 prefix, yes_no(c->protect_control_groups),
5786                 prefix, yes_no(c->private_network),
5787                 prefix, yes_no(c->private_users),
5788                 prefix, protect_home_to_string(c->protect_home),
5789                 prefix, protect_system_to_string(c->protect_system),
5790                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5791                 prefix, yes_no(c->ignore_sigpipe),
5792                 prefix, yes_no(c->memory_deny_write_execute),
5793                 prefix, yes_no(c->restrict_realtime),
5794                 prefix, yes_no(c->restrict_suid_sgid),
5795                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5796                 prefix, yes_no(c->protect_hostname),
5797                 prefix, protect_proc_to_string(c->protect_proc),
5798                 prefix, proc_subset_to_string(c->proc_subset));
5799
5800         if (c->root_image)
5801                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5802
5803         if (c->root_image_options) {
5804                 fprintf(f, "%sRootImageOptions:", prefix);
5805                 LIST_FOREACH(mount_options, o, c->root_image_options)
5806                         if (!isempty(o->options))
5807                                 fprintf(f, " %s:%s",
5808                                         partition_designator_to_string(o->partition_designator),
5809                                         o->options);
5810                 fprintf(f, "\n");
5811         }
5812
5813         if (c->root_hash) {
5814                 _cleanup_free_ char *encoded = NULL;
5815                 encoded = hexmem(c->root_hash, c->root_hash_size);
5816                 if (encoded)
5817                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5818         }
5819
5820         if (c->root_hash_path)
5821                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5822
5823         if (c->root_hash_sig) {
5824                 _cleanup_free_ char *encoded = NULL;
5825                 ssize_t len;
5826                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5827                 if (len)
5828                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5829         }
5830
5831         if (c->root_hash_sig_path)
5832                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5833
5834         if (c->root_verity)
5835                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5836
5837         STRV_FOREACH(e, c->environment)
5838                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5839
5840         STRV_FOREACH(e, c->environment_files)
5841                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5842
5843         STRV_FOREACH(e, c->pass_environment)
5844                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5845
5846         STRV_FOREACH(e, c->unset_environment)
5847                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5848
5849         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5850
5851         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5852                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5853
5854                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5855                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5856
5857                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5858                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5859                 }
5860         }
5861
5862         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5863
5864         if (c->nice_set)
5865                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5866
5867         if (c->oom_score_adjust_set)
5868                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5869
5870         if (c->coredump_filter_set)
5871                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5872
5873         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5874                 if (c->rlimit[i]) {
5875                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5876                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5877                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5878                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5879                 }
5880
5881         if (c->ioprio_set) {
5882                 _cleanup_free_ char *class_str = NULL;
5883
5884                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5885                 if (r >= 0)
5886                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5887
5888                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5889         }
5890
5891         if (c->cpu_sched_set) {
5892                 _cleanup_free_ char *policy_str = NULL;
5893
5894                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5895                 if (r >= 0)
5896                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5897
5898                 fprintf(f,
5899                         "%sCPUSchedulingPriority: %i\n"
5900                         "%sCPUSchedulingResetOnFork: %s\n",
5901                         prefix, c->cpu_sched_priority,
5902                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5903         }
5904
5905         if (c->cpu_set.set) {
5906                 _cleanup_free_ char *affinity = NULL;
5907
5908                 affinity = cpu_set_to_range_string(&c->cpu_set);
5909                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5910         }
5911
5912         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5913                 _cleanup_free_ char *nodes = NULL;
5914
5915                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5916                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5917                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5918         }
5919
5920         if (c->timer_slack_nsec != NSEC_INFINITY)
5921                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5922
5923         fprintf(f,
5924                 "%sStandardInput: %s\n"
5925                 "%sStandardOutput: %s\n"
5926                 "%sStandardError: %s\n",
5927                 prefix, exec_input_to_string(c->std_input),
5928                 prefix, exec_output_to_string(c->std_output),
5929                 prefix, exec_output_to_string(c->std_error));
5930
5931         if (c->std_input == EXEC_INPUT_NAMED_FD)
5932                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5933         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5934                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5935         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5936                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5937
5938         if (c->std_input == EXEC_INPUT_FILE)
5939                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5940         if (c->std_output == EXEC_OUTPUT_FILE)
5941                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5942         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5943                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5944         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5945                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5946         if (c->std_error == EXEC_OUTPUT_FILE)
5947                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5948         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5949                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5950         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5951                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5952
5953         if (c->tty_path)
5954                 fprintf(f,
5955                         "%sTTYPath: %s\n"
5956                         "%sTTYReset: %s\n"
5957                         "%sTTYVHangup: %s\n"
5958                         "%sTTYVTDisallocate: %s\n"
5959                         "%sTTYRows: %u\n"
5960                         "%sTTYColumns: %u\n",
5961                         prefix, c->tty_path,
5962                         prefix, yes_no(c->tty_reset),
5963                         prefix, yes_no(c->tty_vhangup),
5964                         prefix, yes_no(c->tty_vt_disallocate),
5965                         prefix, c->tty_rows,
5966                         prefix, c->tty_cols);
5967
5968         if (IN_SET(c->std_output,
5969                    EXEC_OUTPUT_KMSG,
5970                    EXEC_OUTPUT_JOURNAL,
5971                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5972                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5973             IN_SET(c->std_error,
5974                    EXEC_OUTPUT_KMSG,
5975                    EXEC_OUTPUT_JOURNAL,
5976                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5977                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5978
5979                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5980
5981                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5982                 if (r >= 0)
5983                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5984
5985                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5986                 if (r >= 0)
5987                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5988         }
5989
5990         if (c->log_level_max >= 0) {
5991                 _cleanup_free_ char *t = NULL;
5992
5993                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5994
5995                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5996         }
5997
5998         if (c->log_ratelimit_interval_usec > 0)
5999                 fprintf(f,
6000                         "%sLogRateLimitIntervalSec: %s\n",
6001                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6002
6003         if (c->log_ratelimit_burst > 0)
6004                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6005
6006         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6007                 fprintf(f, "%sLogFilterPatterns:", prefix);
6008
6009                 char *pattern;
6010                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6011                         fprintf(f, " %s", pattern);
6012                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6013                         fprintf(f, " ~%s", pattern);
6014                 fputc('\n', f);
6015         }
6016
6017         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6018                 fprintf(f, "%sLogExtraFields: ", prefix);
6019                 fwrite(c->log_extra_fields[j].iov_base,
6020                        1, c->log_extra_fields[j].iov_len,
6021                        f);
6022                 fputc('\n', f);
6023         }
6024
6025         if (c->log_namespace)
6026                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6027
6028         if (c->secure_bits) {
6029                 _cleanup_free_ char *str = NULL;
6030
6031                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6032                 if (r >= 0)
6033                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6034         }
6035
6036         if (c->capability_bounding_set != CAP_ALL) {
6037                 _cleanup_free_ char *str = NULL;
6038
6039                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6040                 if (r >= 0)
6041                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6042         }
6043
6044         if (c->capability_ambient_set != 0) {
6045                 _cleanup_free_ char *str = NULL;
6046
6047                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6048                 if (r >= 0)
6049                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6050         }
6051
6052         if (c->user)
6053                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6054         if (c->group)
6055                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6056
6057         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6058
6059         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6060
6061         if (c->pam_name)
6062                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6063
6064         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6065         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6066         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6067         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6068         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6069         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6070
6071         for (size_t i = 0; i < c->n_bind_mounts; i++)
6072                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6073                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6074                         c->bind_mounts[i].ignore_enoent ? "-": "",
6075                         c->bind_mounts[i].source,
6076                         c->bind_mounts[i].destination,
6077                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6078
6079         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6080                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6081
6082                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6083                         t->path,
6084                         isempty(t->options) ? "" : ":",
6085                         strempty(t->options));
6086         }
6087
6088         if (c->utmp_id)
6089                 fprintf(f,
6090                         "%sUtmpIdentifier: %s\n",
6091                         prefix, c->utmp_id);
6092
6093         if (c->selinux_context)
6094                 fprintf(f,
6095                         "%sSELinuxContext: %s%s\n",
6096                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6097
6098         if (c->apparmor_profile)
6099                 fprintf(f,
6100                         "%sAppArmorProfile: %s%s\n",
6101                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6102
6103         if (c->smack_process_label)
6104                 fprintf(f,
6105                         "%sSmackProcessLabel: %s%s\n",
6106                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6107
6108         if (c->personality != PERSONALITY_INVALID)
6109                 fprintf(f,
6110                         "%sPersonality: %s\n",
6111                         prefix, strna(personality_to_string(c->personality)));
6112
6113         fprintf(f,
6114                 "%sLockPersonality: %s\n",
6115                 prefix, yes_no(c->lock_personality));
6116
6117         if (c->syscall_filter) {
6118                 fprintf(f,
6119                         "%sSystemCallFilter: ",
6120                         prefix);
6121
6122                 if (!c->syscall_allow_list)
6123                         fputc('~', f);
6124
6125 #if HAVE_SECCOMP
6126                 void *id, *val;
6127                 bool first = true;
6128                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6129                         _cleanup_free_ char *name = NULL;
6130                         const char *errno_name = NULL;
6131                         int num = PTR_TO_INT(val);
6132
6133                         if (first)
6134                                 first = false;
6135                         else
6136                                 fputc(' ', f);
6137
6138                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6139                         fputs(strna(name), f);
6140
6141                         if (num >= 0) {
6142                                 errno_name = seccomp_errno_or_action_to_string(num);
6143                                 if (errno_name)
6144                                         fprintf(f, ":%s", errno_name);
6145                                 else
6146                                         fprintf(f, ":%d", num);
6147                         }
6148                 }
6149 #endif
6150
6151                 fputc('\n', f);
6152         }
6153
6154         if (c->syscall_archs) {
6155                 fprintf(f,
6156                         "%sSystemCallArchitectures:",
6157                         prefix);
6158
6159 #if HAVE_SECCOMP
6160                 void *id;
6161                 SET_FOREACH(id, c->syscall_archs)
6162                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6163 #endif
6164                 fputc('\n', f);
6165         }
6166
6167         if (exec_context_restrict_namespaces_set(c)) {
6168                 _cleanup_free_ char *s = NULL;
6169
6170                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6171                 if (r >= 0)
6172                         fprintf(f, "%sRestrictNamespaces: %s\n",
6173                                 prefix, strna(s));
6174         }
6175
6176 #if HAVE_LIBBPF
6177         if (exec_context_restrict_filesystems_set(c)) {
6178                 char *fs;
6179                 SET_FOREACH(fs, c->restrict_filesystems)
6180                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6181         }
6182 #endif
6183
6184         if (c->network_namespace_path)
6185                 fprintf(f,
6186                         "%sNetworkNamespacePath: %s\n",
6187                         prefix, c->network_namespace_path);
6188
6189         if (c->syscall_errno > 0) {
6190                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6191
6192 #if HAVE_SECCOMP
6193                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6194                 if (errno_name)
6195                         fputs(errno_name, f);
6196                 else
6197                         fprintf(f, "%d", c->syscall_errno);
6198 #endif
6199                 fputc('\n', f);
6200         }
6201
6202         for (size_t i = 0; i < c->n_mount_images; i++) {
6203                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6204                         c->mount_images[i].ignore_enoent ? "-": "",
6205                         c->mount_images[i].source,
6206                         c->mount_images[i].destination);
6207                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6208                         fprintf(f, ":%s:%s",
6209                                 partition_designator_to_string(o->partition_designator),
6210                                 strempty(o->options));
6211                 fprintf(f, "\n");
6212         }
6213
6214         for (size_t i = 0; i < c->n_extension_images; i++) {
6215                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6216                         c->extension_images[i].ignore_enoent ? "-": "",
6217                         c->extension_images[i].source);
6218                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6219                         fprintf(f, ":%s:%s",
6220                                 partition_designator_to_string(o->partition_designator),
6221                                 strempty(o->options));
6222                 fprintf(f, "\n");
6223         }
6224
6225         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6226 }
6227
6228 bool exec_context_maintains_privileges(const ExecContext *c) {
6229         assert(c);
6230
6231         /* Returns true if the process forked off would run under
6232          * an unchanged UID or as root. */
6233
6234         if (!c->user)
6235                 return true;
6236
6237         if (streq(c->user, "root") || streq(c->user, "0"))
6238                 return true;
6239
6240         return false;
6241 }
6242
6243 int exec_context_get_effective_ioprio(const ExecContext *c) {
6244         int p;
6245
6246         assert(c);
6247
6248         if (c->ioprio_set)
6249                 return c->ioprio;
6250
6251         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6252         if (p < 0)
6253                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6254
6255         return ioprio_normalize(p);
6256 }
6257
6258 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6259         assert(c);
6260
6261         /* Explicit setting wins */
6262         if (c->mount_apivfs_set)
6263                 return c->mount_apivfs;
6264
6265         /* Default to "yes" if root directory or image are specified */
6266         if (exec_context_with_rootfs(c))
6267                 return true;
6268
6269         return false;
6270 }
6271
6272 void exec_context_free_log_extra_fields(ExecContext *c) {
6273         assert(c);
6274
6275         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6276                 free(c->log_extra_fields[l].iov_base);
6277         c->log_extra_fields = mfree(c->log_extra_fields);
6278         c->n_log_extra_fields = 0;
6279 }
6280
6281 void exec_context_revert_tty(ExecContext *c) {
6282         _cleanup_close_ int fd = -EBADF;
6283         const char *path;
6284         struct stat st;
6285         int r;
6286
6287         assert(c);
6288
6289         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6290         exec_context_tty_reset(c, NULL);
6291
6292         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6293          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6294          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6295         if (!exec_context_may_touch_tty(c))
6296                 return;
6297
6298         path = exec_context_tty_path(c);
6299         if (!path)
6300                 return;
6301
6302         fd = open(path, O_PATH|O_CLOEXEC);
6303         if (fd < 0)
6304                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6305                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6306                                              path);
6307
6308         if (fstat(fd, &st) < 0)
6309                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6310
6311         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6312          * if things are a character device, since a proper check either means we'd have to open the TTY and
6313          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6314          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6315          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6316         if (!S_ISCHR(st.st_mode))
6317                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6318
6319         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6320         if (r < 0)
6321                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6322 }
6323
6324 int exec_context_get_clean_directories(
6325                 ExecContext *c,
6326                 char **prefix,
6327                 ExecCleanMask mask,
6328                 char ***ret) {
6329
6330         _cleanup_strv_free_ char **l = NULL;
6331         int r;
6332
6333         assert(c);
6334         assert(prefix);
6335         assert(ret);
6336
6337         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6338                 if (!FLAGS_SET(mask, 1U << t))
6339                         continue;
6340
6341                 if (!prefix[t])
6342                         continue;
6343
6344                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6345                         char *j;
6346
6347                         j = path_join(prefix[t], c->directories[t].items[i].path);
6348                         if (!j)
6349                                 return -ENOMEM;
6350
6351                         r = strv_consume(&l, j);
6352                         if (r < 0)
6353                                 return r;
6354
6355                         /* Also remove private directories unconditionally. */
6356                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6357                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6358                                 if (!j)
6359                                         return -ENOMEM;
6360
6361                                 r = strv_consume(&l, j);
6362                                 if (r < 0)
6363                                         return r;
6364                         }
6365
6366                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6367                                 j = path_join(prefix[t], *symlink);
6368                                 if (!j)
6369                                         return -ENOMEM;
6370
6371                                 r = strv_consume(&l, j);
6372                                 if (r < 0)
6373                                         return r;
6374                         }
6375                 }
6376         }
6377
6378         *ret = TAKE_PTR(l);
6379         return 0;
6380 }
6381
6382 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6383         ExecCleanMask mask = 0;
6384
6385         assert(c);
6386         assert(ret);
6387
6388         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6389                 if (c->directories[t].n_items > 0)
6390                         mask |= 1U << t;
6391
6392         *ret = mask;
6393         return 0;
6394 }
6395
6396 void exec_status_start(ExecStatus *s, pid_t pid) {
6397         assert(s);
6398
6399         *s = (ExecStatus) {
6400                 .pid = pid,
6401         };
6402
6403         dual_timestamp_get(&s->start_timestamp);
6404 }
6405
6406 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6407         assert(s);
6408
6409         if (s->pid != pid)
6410                 *s = (ExecStatus) {
6411                         .pid = pid,
6412                 };
6413
6414         dual_timestamp_get(&s->exit_timestamp);
6415
6416         s->code = code;
6417         s->status = status;
6418
6419         if (context && context->utmp_id)
6420                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6421 }
6422
6423 void exec_status_reset(ExecStatus *s) {
6424         assert(s);
6425
6426         *s = (ExecStatus) {};
6427 }
6428
6429 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6430         assert(s);
6431         assert(f);
6432
6433         if (s->pid <= 0)
6434                 return;
6435
6436         prefix = strempty(prefix);
6437
6438         fprintf(f,
6439                 "%sPID: "PID_FMT"\n",
6440                 prefix, s->pid);
6441
6442         if (dual_timestamp_is_set(&s->start_timestamp))
6443                 fprintf(f,
6444                         "%sStart Timestamp: %s\n",
6445                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6446
6447         if (dual_timestamp_is_set(&s->exit_timestamp))
6448                 fprintf(f,
6449                         "%sExit Timestamp: %s\n"
6450                         "%sExit Code: %s\n"
6451                         "%sExit Status: %i\n",
6452                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6453                         prefix, sigchld_code_to_string(s->code),
6454                         prefix, s->status);
6455 }
6456
6457 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6458         _cleanup_free_ char *cmd = NULL;
6459         const char *prefix2;
6460
6461         assert(c);
6462         assert(f);
6463
6464         prefix = strempty(prefix);
6465         prefix2 = strjoina(prefix, "\t");
6466
6467         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6468
6469         fprintf(f,
6470                 "%sCommand Line: %s\n",
6471                 prefix, strnull(cmd));
6472
6473         exec_status_dump(&c->exec_status, f, prefix2);
6474 }
6475
6476 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6477         assert(f);
6478
6479         prefix = strempty(prefix);
6480
6481         LIST_FOREACH(command, i, c)
6482                 exec_command_dump(i, f, prefix);
6483 }
6484
6485 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6486         ExecCommand *end;
6487
6488         assert(l);
6489         assert(e);
6490
6491         if (*l) {
6492                 /* It's kind of important, that we keep the order here */
6493                 LIST_FIND_TAIL(command, *l, end);
6494                 LIST_INSERT_AFTER(command, *l, end, e);
6495         } else
6496               *l = e;
6497 }
6498
6499 int exec_command_set(ExecCommand *c, const char *path, ...) {
6500         va_list ap;
6501         char **l, *p;
6502
6503         assert(c);
6504         assert(path);
6505
6506         va_start(ap, path);
6507         l = strv_new_ap(path, ap);
6508         va_end(ap);
6509
6510         if (!l)
6511                 return -ENOMEM;
6512
6513         p = strdup(path);
6514         if (!p) {
6515                 strv_free(l);
6516                 return -ENOMEM;
6517         }
6518
6519         free_and_replace(c->path, p);
6520
6521         return strv_free_and_replace(c->argv, l);
6522 }
6523
6524 int exec_command_append(ExecCommand *c, const char *path, ...) {
6525         _cleanup_strv_free_ char **l = NULL;
6526         va_list ap;
6527         int r;
6528
6529         assert(c);
6530         assert(path);
6531
6532         va_start(ap, path);
6533         l = strv_new_ap(path, ap);
6534         va_end(ap);
6535
6536         if (!l)
6537                 return -ENOMEM;
6538
6539         r = strv_extend_strv(&c->argv, l, false);
6540         if (r < 0)
6541                 return r;
6542
6543         return 0;
6544 }
6545
6546 static void *remove_tmpdir_thread(void *p) {
6547         _cleanup_free_ char *path = p;
6548
6549         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6550         return NULL;
6551 }
6552
6553 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6554         int r;
6555
6556         if (!rt)
6557                 return NULL;
6558
6559         if (rt->manager)
6560                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6561
6562         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6563
6564         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6565                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6566
6567                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6568                 if (r < 0)
6569                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6570                 else
6571                         rt->tmp_dir = NULL;
6572         }
6573
6574         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6575                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6576
6577                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6578                 if (r < 0)
6579                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6580                 else
6581                         rt->var_tmp_dir = NULL;
6582         }
6583
6584         rt->id = mfree(rt->id);
6585         rt->tmp_dir = mfree(rt->tmp_dir);
6586         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6587         safe_close_pair(rt->netns_storage_socket);
6588         safe_close_pair(rt->ipcns_storage_socket);
6589         return mfree(rt);
6590 }
6591
6592 static void exec_runtime_freep(ExecRuntime **rt) {
6593         (void) exec_runtime_free(*rt, false);
6594 }
6595
6596 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6597         _cleanup_free_ char *id_copy = NULL;
6598         ExecRuntime *n;
6599
6600         assert(ret);
6601
6602         id_copy = strdup(id);
6603         if (!id_copy)
6604                 return -ENOMEM;
6605
6606         n = new(ExecRuntime, 1);
6607         if (!n)
6608                 return -ENOMEM;
6609
6610         *n = (ExecRuntime) {
6611                 .id = TAKE_PTR(id_copy),
6612                 .netns_storage_socket = { -EBADF, -EBADF },
6613                 .ipcns_storage_socket = { -EBADF, -EBADF },
6614         };
6615
6616         *ret = n;
6617         return 0;
6618 }
6619
6620 static int exec_runtime_add(
6621                 Manager *m,
6622                 const char *id,
6623                 char **tmp_dir,
6624                 char **var_tmp_dir,
6625                 int netns_storage_socket[2],
6626                 int ipcns_storage_socket[2],
6627                 ExecRuntime **ret) {
6628
6629         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6630         int r;
6631
6632         assert(m);
6633         assert(id);
6634
6635         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6636
6637         r = exec_runtime_allocate(&rt, id);
6638         if (r < 0)
6639                 return r;
6640
6641         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6642         if (r < 0)
6643                 return r;
6644
6645         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6646         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6647         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6648
6649         if (netns_storage_socket) {
6650                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6651                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6652         }
6653
6654         if (ipcns_storage_socket) {
6655                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6656                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6657         }
6658
6659         rt->manager = m;
6660
6661         if (ret)
6662                 *ret = rt;
6663         /* do not remove created ExecRuntime object when the operation succeeds. */
6664         TAKE_PTR(rt);
6665         return 0;
6666 }
6667
6668 static int exec_runtime_make(
6669                 Manager *m,
6670                 const ExecContext *c,
6671                 const char *id,
6672                 ExecRuntime **ret) {
6673
6674         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6675         _cleanup_close_pair_ int netns_storage_socket[2] = { -EBADF, -EBADF }, ipcns_storage_socket[2] = { -EBADF, -EBADF };
6676         int r;
6677
6678         assert(m);
6679         assert(c);
6680         assert(id);
6681
6682         /* It is not necessary to create ExecRuntime object. */
6683         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6684                 *ret = NULL;
6685                 return 0;
6686         }
6687
6688         if (c->private_tmp &&
6689             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6690               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6691                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6692                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6693                 if (r < 0)
6694                         return r;
6695         }
6696
6697         if (c->private_network || c->network_namespace_path) {
6698                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6699                         return -errno;
6700         }
6701
6702         if (c->private_ipc || c->ipc_namespace_path) {
6703                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6704                         return -errno;
6705         }
6706
6707         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6708         if (r < 0)
6709                 return r;
6710
6711         return 1;
6712 }
6713
6714 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6715         ExecRuntime *rt;
6716         int r;
6717
6718         assert(m);
6719         assert(id);
6720         assert(ret);
6721
6722         rt = hashmap_get(m->exec_runtime_by_id, id);
6723         if (rt)
6724                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6725                 goto ref;
6726
6727         if (!create) {
6728                 *ret = NULL;
6729                 return 0;
6730         }
6731
6732         /* If not found, then create a new object. */
6733         r = exec_runtime_make(m, c, id, &rt);
6734         if (r < 0)
6735                 return r;
6736         if (r == 0) {
6737                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6738                 *ret = NULL;
6739                 return 0;
6740         }
6741
6742 ref:
6743         /* increment reference counter. */
6744         rt->n_ref++;
6745         *ret = rt;
6746         return 1;
6747 }
6748
6749 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6750         if (!rt)
6751                 return NULL;
6752
6753         assert(rt->n_ref > 0);
6754
6755         rt->n_ref--;
6756         if (rt->n_ref > 0)
6757                 return NULL;
6758
6759         return exec_runtime_free(rt, destroy);
6760 }
6761
6762 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6763         ExecRuntime *rt;
6764
6765         assert(m);
6766         assert(f);
6767         assert(fds);
6768
6769         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6770                 fprintf(f, "exec-runtime=%s", rt->id);
6771
6772                 if (rt->tmp_dir)
6773                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6774
6775                 if (rt->var_tmp_dir)
6776                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6777
6778                 if (rt->netns_storage_socket[0] >= 0) {
6779                         int copy;
6780
6781                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6782                         if (copy < 0)
6783                                 return copy;
6784
6785                         fprintf(f, " netns-socket-0=%i", copy);
6786                 }
6787
6788                 if (rt->netns_storage_socket[1] >= 0) {
6789                         int copy;
6790
6791                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6792                         if (copy < 0)
6793                                 return copy;
6794
6795                         fprintf(f, " netns-socket-1=%i", copy);
6796                 }
6797
6798                 if (rt->ipcns_storage_socket[0] >= 0) {
6799                         int copy;
6800
6801                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6802                         if (copy < 0)
6803                                 return copy;
6804
6805                         fprintf(f, " ipcns-socket-0=%i", copy);
6806                 }
6807
6808                 if (rt->ipcns_storage_socket[1] >= 0) {
6809                         int copy;
6810
6811                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6812                         if (copy < 0)
6813                                 return copy;
6814
6815                         fprintf(f, " ipcns-socket-1=%i", copy);
6816                 }
6817
6818                 fputc('\n', f);
6819         }
6820
6821         return 0;
6822 }
6823
6824 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6825         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6826         ExecRuntime *rt;
6827         int r;
6828
6829         /* This is for the migration from old (v237 or earlier) deserialization text.
6830          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6831          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6832          * so or not from the serialized text, then we always creates a new object owned by this. */
6833
6834         assert(u);
6835         assert(key);
6836         assert(value);
6837
6838         /* Manager manages ExecRuntime objects by the unit id.
6839          * So, we omit the serialized text when the unit does not have id (yet?)... */
6840         if (isempty(u->id)) {
6841                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6842                 return 0;
6843         }
6844
6845         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6846                 return log_oom();
6847
6848         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6849         if (!rt) {
6850                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6851                         return log_oom();
6852
6853                 rt = rt_create;
6854         }
6855
6856         if (streq(key, "tmp-dir")) {
6857                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6858                         return -ENOMEM;
6859
6860         } else if (streq(key, "var-tmp-dir")) {
6861                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6862                         return -ENOMEM;
6863
6864         } else if (streq(key, "netns-socket-0")) {
6865                 int fd;
6866
6867                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6868                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6869                         return 0;
6870                 }
6871
6872                 safe_close(rt->netns_storage_socket[0]);
6873                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6874
6875         } else if (streq(key, "netns-socket-1")) {
6876                 int fd;
6877
6878                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6879                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6880                         return 0;
6881                 }
6882
6883                 safe_close(rt->netns_storage_socket[1]);
6884                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6885
6886         } else
6887                 return 0;
6888
6889         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6890         if (rt_create) {
6891                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6892                 if (r < 0) {
6893                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6894                         return 0;
6895                 }
6896
6897                 rt_create->manager = u->manager;
6898
6899                 /* Avoid cleanup */
6900                 TAKE_PTR(rt_create);
6901         }
6902
6903         return 1;
6904 }
6905
6906 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6907         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6908         char *id = NULL;
6909         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6910         const char *p, *v = ASSERT_PTR(value);
6911         size_t n;
6912
6913         assert(m);
6914         assert(fds);
6915
6916         n = strcspn(v, " ");
6917         id = strndupa_safe(v, n);
6918         if (v[n] != ' ')
6919                 goto finalize;
6920         p = v + n + 1;
6921
6922         v = startswith(p, "tmp-dir=");
6923         if (v) {
6924                 n = strcspn(v, " ");
6925                 tmp_dir = strndup(v, n);
6926                 if (!tmp_dir)
6927                         return log_oom();
6928                 if (v[n] != ' ')
6929                         goto finalize;
6930                 p = v + n + 1;
6931         }
6932
6933         v = startswith(p, "var-tmp-dir=");
6934         if (v) {
6935                 n = strcspn(v, " ");
6936                 var_tmp_dir = strndup(v, n);
6937                 if (!var_tmp_dir)
6938                         return log_oom();
6939                 if (v[n] != ' ')
6940                         goto finalize;
6941                 p = v + n + 1;
6942         }
6943
6944         v = startswith(p, "netns-socket-0=");
6945         if (v) {
6946                 char *buf;
6947
6948                 n = strcspn(v, " ");
6949                 buf = strndupa_safe(v, n);
6950
6951                 r = safe_atoi(buf, &netns_fdpair[0]);
6952                 if (r < 0)
6953                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6954                 if (!fdset_contains(fds, netns_fdpair[0]))
6955                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6956                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6957                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6958                 if (v[n] != ' ')
6959                         goto finalize;
6960                 p = v + n + 1;
6961         }
6962
6963         v = startswith(p, "netns-socket-1=");
6964         if (v) {
6965                 char *buf;
6966
6967                 n = strcspn(v, " ");
6968                 buf = strndupa_safe(v, n);
6969
6970                 r = safe_atoi(buf, &netns_fdpair[1]);
6971                 if (r < 0)
6972                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6973                 if (!fdset_contains(fds, netns_fdpair[1]))
6974                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6975                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6976                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6977                 if (v[n] != ' ')
6978                         goto finalize;
6979                 p = v + n + 1;
6980         }
6981
6982         v = startswith(p, "ipcns-socket-0=");
6983         if (v) {
6984                 char *buf;
6985
6986                 n = strcspn(v, " ");
6987                 buf = strndupa_safe(v, n);
6988
6989                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6990                 if (r < 0)
6991                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6992                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6993                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6994                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6995                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6996                 if (v[n] != ' ')
6997                         goto finalize;
6998                 p = v + n + 1;
6999         }
7000
7001         v = startswith(p, "ipcns-socket-1=");
7002         if (v) {
7003                 char *buf;
7004
7005                 n = strcspn(v, " ");
7006                 buf = strndupa_safe(v, n);
7007
7008                 r = safe_atoi(buf, &ipcns_fdpair[1]);
7009                 if (r < 0)
7010                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7011                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7012                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7013                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7014                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7015         }
7016
7017 finalize:
7018         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7019         if (r < 0)
7020                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7021         return 0;
7022 }
7023
7024 void exec_runtime_vacuum(Manager *m) {
7025         ExecRuntime *rt;
7026
7027         assert(m);
7028
7029         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7030
7031         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
7032                 if (rt->n_ref > 0)
7033                         continue;
7034
7035                 (void) exec_runtime_free(rt, false);
7036         }
7037 }
7038
7039 void exec_params_clear(ExecParameters *p) {
7040         if (!p)
7041                 return;
7042
7043         p->environment = strv_free(p->environment);
7044         p->fd_names = strv_free(p->fd_names);
7045         p->fds = mfree(p->fds);
7046         p->exec_fd = safe_close(p->exec_fd);
7047 }
7048
7049 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7050         if (!sc)
7051                 return NULL;
7052
7053         free(sc->id);
7054         free(sc->data);
7055         return mfree(sc);
7056 }
7057
7058 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7059         if (!lc)
7060                 return NULL;
7061
7062         free(lc->id);
7063         free(lc->path);
7064         return mfree(lc);
7065 }
7066
7067 void exec_directory_done(ExecDirectory *d) {
7068         if (!d)
7069                 return;
7070
7071         for (size_t i = 0; i < d->n_items; i++) {
7072                 free(d->items[i].path);
7073                 strv_free(d->items[i].symlinks);
7074         }
7075
7076         d->items = mfree(d->items);
7077         d->n_items = 0;
7078         d->mode = 0755;
7079 }
7080
7081 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7082         assert(d);
7083         assert(path);
7084
7085         for (size_t i = 0; i < d->n_items; i++)
7086                 if (path_equal(d->items[i].path, path))
7087                         return &d->items[i];
7088
7089         return NULL;
7090 }
7091
7092 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7093         _cleanup_strv_free_ char **s = NULL;
7094         _cleanup_free_ char *p = NULL;
7095         ExecDirectoryItem *existing;
7096         int r;
7097
7098         assert(d);
7099         assert(path);
7100
7101         existing = exec_directory_find(d, path);
7102         if (existing) {
7103                 r = strv_extend(&existing->symlinks, symlink);
7104                 if (r < 0)
7105                         return r;
7106
7107                 return 0; /* existing item is updated */
7108         }
7109
7110         p = strdup(path);
7111         if (!p)
7112                 return -ENOMEM;
7113
7114         if (symlink) {
7115                 s = strv_new(symlink);
7116                 if (!s)
7117                         return -ENOMEM;
7118         }
7119
7120         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7121                 return -ENOMEM;
7122
7123         d->items[d->n_items++] = (ExecDirectoryItem) {
7124                 .path = TAKE_PTR(p),
7125                 .symlinks = TAKE_PTR(s),
7126         };
7127
7128         return 1; /* new item is added */
7129 }
7130
7131 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7132         assert(a);
7133         assert(b);
7134
7135         return path_compare(a->path, b->path);
7136 }
7137
7138 void exec_directory_sort(ExecDirectory *d) {
7139         assert(d);
7140
7141         /* Sort the exec directories to make always parent directories processed at first in
7142          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7143          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7144          * list. See also comments in setup_exec_directory() and issue #24783. */
7145
7146         if (d->n_items <= 1)
7147                 return;
7148
7149         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7150
7151         for (size_t i = 1; i < d->n_items; i++)
7152                 for (size_t j = 0; j < i; j++)
7153                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7154                                 d->items[i].only_create = true;
7155                                 break;
7156                         }
7157 }
7158
/* Hash operations for string-keyed (string_hash_func/string_compare_func) maps holding ExecSetCredential
 * values; exec_set_credential_free() is registered as the value destructor. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
/* Hash operations for string-keyed (string_hash_func/string_compare_func) maps holding ExecLoadCredential
 * values; exec_load_credential_free() is registered as the value destructor. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7161
/* Maps each ExecInput value to its textual name. The lookup helpers for this table are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7174
/* Maps each ExecOutput value to its textual name. The lookup helpers for this table are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7191
/* Maps each ExecUtmpMode value to its textual name. The lookup helpers for this table are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7199
/* Maps each ExecPreserveMode value to its textual name. The lookup helpers are generated by the
 * _WITH_BOOLEAN variant of the macro, which is additionally passed EXEC_PRESERVE_YES — presumably the value
 * that a boolean "true" is mapped to when parsing; confirm against the macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7207
/* This table maps ExecDirectoryType to the name of the unit setting it is configured with. The lookup
 * helpers for it are generated by the DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7218
/* This table maps ExecDirectoryType to the name of the unit setting its symlinks are configured with. The
 * lookup helpers for it are generated by the DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7229
/* This table maps ExecDirectoryType as well, but to a generic term identifying the type of resource. The
 * terms are meant to be generic enough to also be used by unit types that do not use ExecContext and
 * per-unit directories, specifically .timer units with their timestamp touch file. The lookup helpers are
 * generated by the DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7242
/* This table also maps ExecDirectoryType, namely to the name of the environment variable in which we pass
 * the selected directory to the service payload. Judging by the macro used below, only the to-string
 * direction is generated for it, and only for use within this file. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7254
/* Maps each ExecKeyringMode value to its textual name. The lookup helpers for this table are generated by
 * the DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);